# Extract only the variables of interest from one participant's raw file.
#
# The extraction is driven by a coordinate mapping data frame: its 2nd
# column holds the variable (column) names to look up in the raw file,
# and its 4th column holds the row index at which each value is stored.
#
# @param file_path Path to a single participant's raw CSV file.
# @param mapping   Coordinate mapping data frame. Defaults to the global
#                  `coordinate` object (read from "da.csv" later in this
#                  document), so existing one-argument calls keep working.
# @return A vector with one extracted value per mapping row; NA when the
#         requested variable is absent from the raw file.
estrai_valori <- function(file_path, mapping = coordinate) {
  dataset <- read.csv(file_path)
  variabili <- mapping[, 2] # Variable names to extract
  righe <- mapping[, 4]     # Corresponding rows
  # Preallocate the result instead of growing it with c() inside the loop
  risultati <- vector("list", length(variabili))
  for (i in seq_along(variabili)) {
    var <- variabili[i]
    if (var %in% colnames(dataset)) {
      # Out-of-range row indices yield NA here, same as the original code
      risultati[[i]] <- dataset[righe[i], var]
    } else {
      risultati[[i]] <- NA
    }
  }
  unlist(risultati)
}
The Foreign Language Effect on the Illusion of Causality: A Replication Attempt and an Exploratory Analysis of Underlying Mechanisms. Formal analysis document
Keywords: Illusion of Causality, Foreign Language Effect, Cognitive Bias
The Foreign Language Effect on the Illusion of Causality: A Replication Attempt and an Exploratory Analysis of Underlying Mechanisms. Formal analysis document
1. Data Import and Processing
1.1 Importing Raw Data to Create the Dataset
To construct the dataset, we employed the pre-registered R pipeline estrai_valori (available on the OSF project page). This pipeline extracts only the relevant variables of interest from each participant’s raw data file.
To function correctly, the pipeline requires an auxiliary file specifying the appropriate variable mappings and extraction coordinates within each individual data frame. This file, titled ExperimentVariables.csv, is available at the OSF project page.
# Reading the coordinate mapping (ExperimentVariables file; see Section 1.1)
coordinate <- read.csv("da.csv")
# Retrieving paths to raw data files in the "Analisi" sub-folder.
# NOTE(review): no `pattern = "\\.csv$"` filter is applied, so any stray
# non-CSV file in the folder would be passed to read.csv — confirm the
# folder contains only raw data files.
file_paths <- list.files(
path = file.path(getwd(), "Analisi"),
full.names = TRUE
)
# Applying the extraction function to each file (one value vector per
# participant)
risultati <- lapply(file_paths, estrai_valori)
# Combining results: rbind on vectors yields a matrix with one row per
# participant (converted to a proper data frame in the next chunk)
df_risultati <- do.call(rbind, risultati)
# Assigning column names based on coordinate file
colnames(df_risultati) <- coordinate[, 3]
1.2 Data Processing
After combining the extracted data into a single data frame, variables were converted to their appropriate data types (numeric, logical, or factor). Trial numeric evaluation variables (i.e., trial_xx_y) were consolidated into a single column, as the Italian (trial_xx_i) and English (trial_xx_e) trial evaluations reflect the same underlying construct.
Responses in the objective English test variables were re-coded: a value of 1 was assigned to correct responses and 0 to incorrect ones.
Descriptive variable labels were assigned based on the fifth column of the coordinate file, thereby enhancing the interpretability of the dataset. A visualization was generated to present the variables and their interpretations. Additionally, a time variable for the trial number evaluation (i.e., t_trial) was created by computing the difference between the total time recorded after and before the experimental routine.
# Convert the rbind result (a matrix) to a proper data frame
df_risultati <- as.data.frame(df_risultati)
# Duplicate the data frame for labeling; df1 is the working copy used in
# all subsequent chunks
df1 <- df_risultati
# Factor variables (demographics and design factors)
df1[c("participant", "condition", "sesso", "studi",
"scolarita", "diploma_e", "altrodiploma")] <-
lapply(df1[c("participant", "condition", "sesso", "studi",
"scolarita", "diploma_e", "altrodiploma")], as.factor)
# Logical variables (language background and residence flags)
df1[c("linguanativa_i", "linguanativa_e", "estero")] <-
lapply(df1[c("linguanativa_i",
"linguanativa_e", "estero")], as.logical)
# Numeric variables: response times, ratings, age, and language
# self-assessments
num_vars <- c(
"t_consenso", "t_sam", "t_istruzioni", "intensita", "valenza",
"t_task", "t_causalita", "causalita", "t_esperienza", "esperienza",
"t_pf", "t_pf_cum_inizio", "t_pf_cum_fine", "totaltime",
"pf", "eta", "AOA_e", "AOA_i",
"lett_e", "comp_e", "scri_e", "parl_e",
"lett_i", "comp_i", "scri_i", "parl_i"
)
df1[num_vars] <- lapply(df1[num_vars], as.numeric)
# Replace NA in *_e trial columns with *_i values (Italian and English
# trial evaluations measure the same construct; see Section 1.2 text).
# NOTE(review): positional rename — this assumes column 17 is the
# ss-condition English trial rating; verify against the coordinate file.
names(df1)[17] <- "trial_ss_e"
for (cond in c("nn", "ns", "sn", "ss")) {
e_col <- paste0("trial_", cond, "_e")
i_col <- paste0("trial_", cond, "_i")
df1[[e_col]][is.na(df1[[e_col]])] <-
df1[[i_col]][is.na(df1[[e_col]])]
}
# Consolidated trial columns to numeric
df1$trial_nn_e <- as.numeric(df1$trial_nn_e)
df1$trial_ns_e <- as.numeric(df1$trial_ns_e)
df1$trial_sn_e <- as.numeric(df1$trial_sn_e)
df1$trial_ss_e <- as.numeric(df1$trial_ss_e)
# Convert PI columns to factors.
# NOTE(review): this as.factor step is immediately overwritten by the 0/1
# recoding below, so it has no lasting effect.
pi_vars <- c(paste0("PI_", 1:25), "PI_26_controllo")
df1[pi_vars] <- lapply(df1[pi_vars], as.factor)
# Re-code objective English test items: 1 if the raw response string
# contains "ok" (a correct answer), else 0.
# (pi_vars is re-defined here identically to the definition above.)
pi_vars <- c(paste0("PI_", 1:25), "PI_26_controllo")
df1[pi_vars] <- lapply(df1[pi_vars],
function(x) as.numeric(grepl(
"ok", x, ignore.case = TRUE)))
# Apply the descriptive labels (5th column of the coordinate file) to the
# data frame, and collect name/label pairs for display
df_names <- data.frame(name = rep(NA, ncol(df1)),
label = rep(NA, ncol(df1)))
for (i in 1:ncol(df1)){
attr(df1[,i], "variable.labels") <- coordinate[i,5]
df_names[i,1] <- colnames(df1)[i]
df_names[i,2] <- attr(df1[,i],"variable.labels")
}
# Creating time variable for trial numeric evaluation.
# NOTE(review): the text says the difference is "after minus before", but
# this computes inizio (start) - fine (end), which would make durations
# negative — confirm the intended sign of t_trial.
df1$t_trial <- df1$t_pf_cum_inizio - df1$t_pf_cum_fine
# Visualizing the variables and their meaning
library(DT)
datatable(df_names,
options = list(scrollX = TRUE, pageLength = 5),
class = 'cell-border stripe')
2. Data Exclusion
2.1 Preliminary Observation
Before applying the exclusion criteria we observe how many data points we have. We have a total of 241 participants. 198 participants have been recruited from prolific, while 43 were recruited at our University.
# Count the number of participants
nrow(df1)
[1] 241
sum(df1$participant =="[(%PROLIFIC_PID")
[1] 198
2.2 Exclusion Criteria
Exclusion criteria were applied in accordance with our pre-registration (available on the OSF project page). The participant 101 was excluded due to incomplete data. Participants who completed the instruction phase in under 10 seconds (n = 4) were also excluded, as were those who responded to any of the key questions – emotional engagement, causality evaluation, personal experience with medicines, or processing fluency – in under 2 seconds (n = 0). 0 participants completed the trial section under 160 seconds.
Additional exclusions were made based on language background: participants who reported not being native Italian speakers (n = 2), those who identified English as their native language (n = 6), those holding a C2 or A1 level English language certificate (n = 2), and those currently residing in an English-speaking country (n = 1).
In the English condition, participants who failed the objective English test (i.e., fewer than 7 correct responses) were excluded (n = 1). Moreover, participants who achieved a perfect score (25/25) were excluded (n = 6).
One participant who reported an A1 level of English was not excluded, as they scored well on the objective English test and self-reported maximum English competence across all subjective items (score = 10 on all). Finally, one online participant who completed the experiment over a duration of three hours was excluded due to concerns about engagement and data quality. After applying the exclusion criteria we remained with 220 participants (110 per group).
# Compute total correct answers on the objective English test
df1$ok_total <- rowSums(df1[, paste0("PI_", 1:25)])
# Exclude the participant with incomplete data (missing processing-fluency
# response). Logical subsetting replaces `df1[-(which(is.na(df1$pf))), ]`:
# if no NA were present, `-which(...)` would be `-integer(0)`, and
# `df1[integer(0), ]` silently drops EVERY row. With at least one NA the
# two forms keep exactly the same rows.
df1 <- df1[!is.na(df1$pf), ]
# Count exclusions for reporting.
# NOTE: counts are taken on the data BEFORE the filters below are applied,
# so overlapping criteria may count the same participant more than once.
exclusion_counts <- c(
1, # Incomplete data (64th participant)
sum(df1$t_istruzioni < 10), # Instructions completed < 10 sec
# Trial section completed < 160 sec. The original code tested `< 10`,
# contradicting the pre-registered 160 s criterion stated in the text
# (the reported count is 0 under both thresholds).
sum(df1$t_task < 160),
sum(df1$t_sam < 2 | df1$t_causalita < 2 |
df1$t_esperienza < 2 | df1$t_pf < 2 |
(df1$t_pf_cum_inizio - df1$t_pf_cum_fine) < 2),
# Response time < 2 sec on any key task
sum(df1$linguanativa_i == FALSE), # Not native Italian speaker
sum(df1$linguanativa_e == TRUE), # English as native language
sum(df1$estero == TRUE), # Living in English-speaking country
sum(grepl("A1", df1$punteggiodiploma,
ignore.case = TRUE) | # Having A1 certificate
grepl("C2", df1$punteggiodiploma,
ignore.case = TRUE)), # Having C2 certificate
sum(df1$ok_total < 7 &
df1$condition == "2"), # Objective test score < 7 (English condition)
sum(df1$ok_total == 25 &
df1$condition == "2") # Objective test perfect score (25/25)
)
# Human-readable labels for the exclusion criteria, in the same order as
# the counts computed above
criterion_labels <- c(
  "Incomplete data (64th participant)",
  "Instructions completed < 10 sec",
  "Trials completed < 160 sec",
  "Response time < 2 sec on key tasks",
  "Not native Italian speaker",
  "English as native language",
  "Living in English-speaking country",
  "C2 or A1 english level",
  "Objective test score < 7 (English condition)",
  "Objective test perfect score (25/25) in English condition"
)
# Pair each label with its count in a plain data frame for reporting
exclusion_df <- data.frame(
  Exclusion_Criterion = criterion_labels,
  Number_Participants = exclusion_counts,
  stringsAsFactors = FALSE
)
# Show as table
knitr::kable(exclusion_df, align = "l")
| Exclusion_Criterion | Number_Participants |
|---|---|
| Incomplete data (64th participant) | 1 |
| Instructions completed < 10 sec | 4 |
| Trials completed < 160 sec | 0 |
| Response time < 2 sec on key tasks | 0 |
| Not native Italian speaker | 2 |
| English as native language | 6 |
| Living in English-speaking country | 1 |
| C2 or A1 english level | 2 |
| Objective test score < 7 (English condition) | 1 |
| Objective test perfect score (25/25) in English condition | 6 |
# Exclude based on instruction time < 10 seconds
df1 <- df1[df1$t_istruzioni >= 10, ]
# Exclude responses under 2 seconds on ANY key task: a participant is kept
# only if ALL four response times are >= 2 s. The original filter joined
# the conditions with `|`, which would only exclude participants below 2 s
# on EVERY task; with the reported counts (n = 0) both forms keep the same
# rows on this dataset.
df1 <- df1[df1$t_sam >= 2 & df1$t_causalita >= 2 &
df1$t_esperienza >= 2 & df1$t_pf >= 2, ]
# NOTE(review): the processing-fluency time difference
# ((t_pf_cum_inizio - t_pf_cum_fine) < 2) is counted in exclusion_counts
# but never filtered here — confirm this is intended (reported n = 0).
# Exclude non-native Italian speakers
df1 <- df1[df1$linguanativa_i == TRUE, ]
# Exclude native English speakers
df1 <- df1[df1$linguanativa_e == FALSE, ]
# Exclude those living abroad in English-speaking countries
df1 <- df1[df1$estero == FALSE, ]
# Exclude participants in English condition who failed (ok_total < 7)
df1 <- df1[!(df1$condition == "2" & df1$ok_total < 7), ]
# Exclude participants in English condition with a perfect score
# (ok_total == 25)
df1 <- df1[!(df1$condition == "2" & df1$ok_total == 25), ]
# Do not exclude the A1 level participant
df1[(grepl("A1", df1$punteggiodiploma, ignore.case = TRUE)),37:40]
   lett_e comp_e scri_e parl_e
66     10     10     10     10
df1$ok_total[grepl("A1", df1$punteggiodiploma, ignore.case = TRUE)]
[1] 23
# C2 level participant already excluded
sum(grepl("C2", df1$punteggiodiploma, ignore.case = TRUE))
[1] 0
# Exclude a participant that completed the experiment in 3 hours
sum(df1$totaltime >3000)
[1] 3
which(df1$totaltime >3000)
[1]   2  29 200
df1[c(which(df1$totaltime >3000)),74] / (60*60)
[1] 0.9508141 0.8539070 3.0698939
df1 <- df1[-200, ]
# Count the number of participants
nrow(df1)
[1] 220
sum(df1$participant =="[(%PROLIFIC_PID")
[1] 182
2.2 Attention Check Failure
We assessed how many participants failed the control question (PI_26). A total of 43 participants had a score of 0 on this item. We then conducted a qualitative, manual inspection of their responses to identify any substantial indicators of inattention or anomalous behavior. This preliminary qualitative assessment revealed no consistent patterns that suggested participant inattention.
# Check the number of participants who failed the control question
nrow(df1[df1$PI_26_controllo == 0, ])
[1] 43
datatable(df1[df1$PI_26_controllo == 0, ],
options = list(scrollX = TRUE, pageLength = 5),
class = 'cell-border stripe')
We analyzed potential differences between participants who failed the control question and those who did not. A comparison of time-related variables across experimental phases revealed no substantial differences between the two groups. Based on these findings, we conclude that the data are reliable and may be retained for inclusion in the final analysis.
# Create ggpairs plot comparing participants by control-question outcome.
# check1 = failed the control question (score 0), check0 = passed.
check1 <-df1[df1$PI_26_controllo == 0, ]
check0 <-df1[df1$PI_26_controllo != 0, ]
library(GGally)
library(ggokabeito)
library(ggplot2)
# Add group labels.
# NOTE(review): check1 holds participants who FAILED the control question
# yet is labelled "Y" — confirm the intended reading of Y/N here.
check0$group <- "Control question: N"
check1$group <- "Control question: Y"
# Combine data (row-bind the two groups back together with their labels)
combined <- rbind(check0, check1)
# Select only numeric variables + the group label for plotting
numeric_vars <- sapply(combined, is.numeric)
combined_numeric <- combined[, numeric_vars]
combined_numeric$group <- combined$group
ggpairs(combined_numeric[,c(2:3,6:7,9,57,15,59)],
aes(color = group, alpha = 0.5))+
theme_bw()+scale_color_okabe_ito()+scale_fill_okabe_ito()
ggparcoord(combined_numeric, columns = c(2:3,6:7,9,57,15),
groupColumn = 59, showPoints = TRUE, boxplot = T,
alphaLines = 0.2)+
theme_bw()+scale_color_okabe_ito()+scale_fill_okabe_ito()
3. Describing the sample
3.1 Recruiting from Two Sources
Participants were recruited from two different sources, resulting in differences in sample composition. In the Prolific sample (N = 182), more participants took part in the second condition (Condition 1: N = 85; Condition 2: N = 97). In contrast, in the University sample (N = 38), more participants were assigned to the first condition (Condition 1: N = 25; Condition 2: N = 13).
A Pearson’s Chi-squared test with Yates’ continuity correction indicates that the association between recruitment source and condition assignment is statistically significant but small (χ² = 3.85, p = 0.050; Adjusted Cramér’s V = 0.13, 95% CI [0.00, 1.00]).
# Recode participant variable for labeling: raw Prolific entries carry the
# "[(%PROLIFIC_PID" placeholder, everything else came from the University
# sample ("P" = Prolific, "U" = University)
df1$participant <- as.factor(ifelse(
df1$participant == "[(%PROLIFIC_PID", "P", "U"))
# Display contingency table
table(df1$participant)
P U
182 38
table(df1$condition, df1$participant)
P U
1 85 25
2 97 13
# Plot mosaic plot of condition by recruitment source
library(ggmosaic)
ggplot(data = df1) +
geom_mosaic(mapping = aes(x = product(condition, participant),
fill = participant)) +
theme_bw() +
labs(x = "Condition", y = "Count", fill = "Participant Source") +
# Legend labels must follow the factor level order ("P", "U"), so "P" maps
# to Prolific and "U" to University (the original vector was reversed,
# mislabeling the legend)
scale_fill_viridis_d(alpha = 0.2, labels = c("Prolific", "University"))
# Chi-squared test with parameter estimates
library(parameters)
chisq_result <- chisq.test(as.matrix(table(df1$participant,
df1$condition)))
chisq_result
Pearson's Chi-squared test with Yates' continuity correction
data: as.matrix(table(df1$participant, df1$condition))
X-squared = 3.849, df = 1, p-value = 0.04977
model_parameters(prop.test(as.matrix(table(df1$participant,
df1$condition))))
2-sample test for equality of proportions
Proportion | Difference | 95% CI | Chi2(1) | p
---------------------------------------------------------------
46.70% / 65.79% | 19.09% | [-0.37, -0.01] | 3.85 | 0.050
Alternative hypothesis: two.sided
3.1.1 Recruitment along time
We provide a graphical representation of the recruitment process over time, distinguishing between our two sources of recruitment (Prolific vs. University). The majority of participants were recruited through Prolific during two main data collection waves: the first in April 2025 (N = 166) and the second in May 2025 (N = 16). The remaining participants (N = 38) were recruited from the university sample over multiple weeks across April and May 2025.
# Showing calendar data (using dplyr pipeline)
library(dplyr)
library(lubridate)
# Parse the raw timestamp string: "h" separates hours/minutes and "_"
# separates date from time in the raw format, so both are normalized
# before ymd_hms() parsing
df1 <- df1 %>%
mutate(
date_time = ymd_hms(gsub("_", " ", gsub("h", ":", date))),
day = as.Date(date_time)
)
# Aggregate participation counts per calendar cell
# (year x month x week-of-month x weekday x recruitment source)
calendar_data <- df1 %>%
mutate(
year = year(day),
month = month(day, label = TRUE),
weekday = wday(day, label = TRUE, week_start = 1),
# week-of-month (1-5), not ISO week of the year
week = ceiling(day(day) / 7),
group = participant
) %>%
count(year, month, week, weekday, group)
datatable(calendar_data,
options = list(scrollX = TRUE, pageLength = 5),
class = 'cell-border stripe')
# Calendar of participation dates
ggplot(calendar_data, aes(x = weekday, y = week, fill = n)) +
geom_tile(color = "black", size = 0.2) +
scale_fill_gradient(low = "white", high = "darkred") +
facet_grid(group ~ month + year, scales = "free_x", space = "free_x") +
labs(x = "Day", y = "Week", fill = "N participants") +
theme_bw() +
theme(
strip.text = element_text(face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1),
panel.spacing = unit(1, "lines")
)3.2 Age
3.2.1 Age (Univariate)
We provided summary statistics and a visual representation of age. The shape of the distribution was then assessed.
# Summary statistics
library(pastecs)
summary(df1$eta) Min. 1st Qu. Median Mean 3rd Qu. Max.
20.00 25.00 29.00 32.65 39.25 65.00
round(stat.desc(df1$eta, norm = TRUE), 2) nbr.val nbr.null nbr.na min max range
220.00 0.00 0.00 20.00 65.00 45.00
sum median mean SE.mean CI.mean.0.95 var
7183.00 29.00 32.65 0.72 1.41 112.68
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
10.61 0.33 0.98 2.98 0.18 0.28
normtest.W normtest.p
0.90 0.00
# Plot
ggplot(df1, aes(x = eta)) +
geom_boxplot(width = 0.004, position = position_nudge(y = -0.005),
outlier.alpha = 0, fill = "darkorange", alpha = 0.5) +
geom_density(fill = "darkorange", alpha = 0.3) +
geom_jitter(mapping = aes(x = eta, y = -0.01),
position = position_jitter(height = 0.002),
color = "darkorange", alpha = 0.5) +
labs(x = "Age", y = "Density") +
theme_bw()# Assess distribution
library(fitdistrplus)
descdist(df1$eta)summary statistics
------
min: 20 max: 65
median: 29
mean: 32.65
estimated sd: 10.6149
estimated skewness: 0.9908064
estimated kurtosis: 3.243691
# Fit candidate distributions to the age variable (maximum likelihood).
# Poisson is included for comparison even though age is treated as
# continuous elsewhere.
fnorm <- fitdist(df1$eta, "norm")
fgamma <- fitdist(df1$eta, "gamma")
fpois <- fitdist(df1$eta, "pois")
# Compare distributions in a 2x2 panel of diagnostic plots.
# NOTE: par(mfrow = ...) changes global graphics state for later base
# plots in this session.
plot.legend <- c("Normal", "Gamma", "Poisson")
par(mfrow = c(2, 2))
denscomp(list(fnorm, fgamma, fpois), legendtext = plot.legend)
qqcomp(list(fnorm, fgamma, fpois), legendtext = plot.legend)
cdfcomp(list(fnorm, fgamma, fpois), legendtext = plot.legend)
ppcomp(list(fnorm, fgamma, fpois), legendtext = plot.legend)
# Goodness-of-fit statistics
gofstat(list(fnorm, fgamma, fpois), fitnames = plot.legend)
Goodness-of-fit statistics
Normal Gamma Poisson
Kolmogorov-Smirnov statistic 0.1529995 0.1273425 0.2377533
Cramer-von Mises statistic 1.2011263 0.7199394 4.1318193
Anderson-Darling statistic 6.9511382 4.1959793 43.8224194
Goodness-of-fit criteria
Normal Gamma Poisson
Akaike's Information Criterion 1666.724 1627.330 1871.200
Bayesian Information Criterion 1673.512 1634.118 1874.594
3.2.2 Age differences between groups
To explore age differences between participant (Prolific vs. University) and groups (1 vs. 2), we began by visualizing and summarizing descriptive statistics of age across groups. We then fitted a series of generalized linear models. These models included different combinations of predictors (participant, condition, and their interaction). We used model selection criteria – Akaike Information Criterion (AIC) and Bayesian Information Criterion (BIC) – to identify the best-fitting model. Finally, we visualized and interpreted the selected model to understand how participant and group predicts age.
# Age by participant
by(df1$eta, df1$participant, function(x) summary(x)) df1$participant: P
Min. 1st Qu. Median Mean 3rd Qu. Max.
22.00 26.00 32.00 34.85 42.75 65.00
------------------------------------------------------------
df1$participant: U
Min. 1st Qu. Median Mean 3rd Qu. Max.
20.00 20.00 21.00 22.13 23.00 29.00
# Age by condition
by(df1$eta, df1$condition, function(x) summary(x))df1$condition: 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
20.00 24.00 28.00 31.56 37.00 65.00
------------------------------------------------------------
df1$condition: 2
Min. 1st Qu. Median Mean 3rd Qu. Max.
20.00 26.00 30.50 33.74 41.00 62.00
# Age by both
by(df1$eta, list(df1$participant, df1$condition), function(x) summary(x)) : P
: 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
22.00 26.00 32.00 34.34 39.00 65.00
------------------------------------------------------------
: U
: 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
20.00 20.00 21.00 22.12 23.00 29.00
------------------------------------------------------------
: P
: 2
Min. 1st Qu. Median Mean 3rd Qu. Max.
22.00 27.00 31.00 35.29 43.00 62.00
------------------------------------------------------------
: U
: 2
Min. 1st Qu. Median Mean 3rd Qu. Max.
20.00 20.00 21.00 22.15 23.00 29.00
# Helper: descriptive statistics (including normality tests) rounded to
# two decimal places, for use with by()
DESC <- function(x) {
  descriptives <- stat.desc(x, norm = TRUE)
  round(descriptives, 2)
}
# Age by participant
by(df1$eta, df1$participant, DESC)df1$participant: P
nbr.val nbr.null nbr.na min max range
182.00 0.00 0.00 22.00 65.00 43.00
sum median mean SE.mean CI.mean.0.95 var
6342.00 32.00 34.85 0.77 1.51 106.87
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
10.34 0.30 0.89 2.47 -0.04 -0.06
normtest.W normtest.p
0.91 0.00
------------------------------------------------------------
df1$participant: U
nbr.val nbr.null nbr.na min max range
38.00 0.00 0.00 20.00 29.00 9.00
sum median mean SE.mean CI.mean.0.95 var
841.00 21.00 22.13 0.42 0.85 6.77
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
2.60 0.12 1.28 1.67 0.58 0.39
normtest.W normtest.p
0.79 0.00
# Age by condition
by(df1$eta, df1$condition, DESC)df1$condition: 1
nbr.val nbr.null nbr.na min max range
110.00 0.00 0.00 20.00 65.00 45.00
sum median mean SE.mean CI.mean.0.95 var
3472.00 28.00 31.56 1.00 1.98 109.28
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
10.45 0.33 1.15 2.49 0.76 0.83
normtest.W normtest.p
0.88 0.00
------------------------------------------------------------
df1$condition: 2
nbr.val nbr.null nbr.na min max range
110.00 0.00 0.00 20.00 62.00 42.00
sum median mean SE.mean CI.mean.0.95 var
3711.00 30.50 33.74 1.02 2.02 114.73
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
10.71 0.32 0.82 1.78 -0.29 -0.32
normtest.W normtest.p
0.91 0.00
# Age by both
by(df1$eta, list(df1$participant, df1$condition), DESC): P
: 1
nbr.val nbr.null nbr.na min max range
85.00 0.00 0.00 22.00 65.00 43.00
sum median mean SE.mean CI.mean.0.95 var
2919.00 32.00 34.34 1.11 2.22 105.54
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
10.27 0.30 1.01 1.94 0.39 0.38
normtest.W normtest.p
0.90 0.00
------------------------------------------------------------
: U
: 1
nbr.val nbr.null nbr.na min max range
25.00 0.00 0.00 20.00 29.00 9.00
sum median mean SE.mean CI.mean.0.95 var
553.00 21.00 22.12 0.52 1.07 6.69
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
2.59 0.12 1.19 1.28 0.31 0.17
normtest.W normtest.p
0.80 0.00
------------------------------------------------------------
: P
: 2
nbr.val nbr.null nbr.na min max range
97.00 0.00 0.00 22.00 62.00 40.00
sum median mean SE.mean CI.mean.0.95 var
3423.00 31.00 35.29 1.06 2.10 108.73
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
10.43 0.30 0.77 1.58 -0.43 -0.44
normtest.W normtest.p
0.91 0.00
------------------------------------------------------------
: U
: 2
nbr.val nbr.null nbr.na min max range
13.00 0.00 0.00 20.00 29.00 9.00
sum median mean SE.mean CI.mean.0.95 var
288.00 21.00 22.15 0.76 1.65 7.47
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
2.73 0.12 1.27 1.03 0.47 0.20
normtest.W normtest.p
0.79 0.00
# Participant
p1 <- ggplot(df1, aes(x = eta)) +
geom_boxplot(width = 0.02, position = position_nudge(y = -0.03),
outlier.alpha = 0, fill = "darkorange", alpha = 0.5) +
geom_density(fill = "darkorange", alpha = 0.3) +
geom_jitter(aes(y = -0.01),
position = position_jitter(height = 0.004),
color = "darkorange", alpha = 0.5) +
labs(x = "Age", y = "Density", title = "Participants") +
facet_wrap(~participant, labeller = label_both) +
theme_bw()
# Condition
p2 <- ggplot(df1, aes(x = eta)) +
geom_boxplot(width = 0.004, position = position_nudge(y = -0.007),
outlier.alpha = 0, fill = "steelblue", alpha = 0.5) +
geom_density(fill = "steelblue", alpha = 0.3) +
geom_jitter(aes(y = -0.003),
position = position_jitter(height = 0.001),
color = "steelblue", alpha = 0.5) +
labs(x = "Age", y = "Density", title = "Condition") +
facet_wrap(~condition, labeller = label_both) +
theme_bw()
# Participant × Condition
p3 <- ggplot(df1, aes(x = eta)) +
geom_boxplot(width = 0.02, position = position_nudge(y = -0.03),
outlier.alpha = 0, fill = "darkgreen", alpha = 0.5) +
geom_density(fill = "darkgreen", alpha = 0.3) +
geom_jitter(aes(y = -0.01),
position = position_jitter(height = 0.002),
color = "darkgreen", alpha = 0.5) +
labs(x = "Age", y = "Density", title = "Participant × Condition") +
facet_grid(participant ~ condition, labeller = label_both) +
theme_bw()
# Combining the plots together
library(patchwork)
combined_plot <- p1 / p2 / p3 + plot_layout(guides = "collect") &
theme(legend.position = "none")
combined_plot
# Models comparison with gamma (link function identity)
# Candidate GLMs for age. A Gamma family is used for the positive,
# right-skewed age distribution (supported by the fitdistrplus comparison
# above); the identity link keeps coefficients on the original age scale.
# Null model (intercept only)
m0 <- glm(eta ~ 1, data = df1, family = Gamma(link = "identity"))
# Condition only
m1 <- glm(eta ~ condition, data = df1,
family = Gamma(link = "identity"))
# Participant (recruitment source) only
m2 <- glm(eta ~ participant, data = df1,
family = Gamma(link = "identity"))
# Additive model
m3 <- glm(eta ~ condition + participant, data = df1,
family = Gamma(link = "identity"))
# Interaction model
m4 <- glm(eta ~ condition * participant, data = df1,
family = Gamma(link = "identity"))
# Model selection (AIC, BIC)
library(MuMIn)
model.sel(m0, m1, m2, m3, m4, rank = AIC)Model selection table
(Int) cnd prt cnd:prt df logLik AIC delta weight
m2 34.85 + 3 -775.136 1556.3 0.00 0.621
m3 34.49 + + 4 -774.961 1557.9 1.65 0.272
m4 34.34 + + + 5 -774.888 1559.8 3.50 0.108
m1 31.56 + 3 -810.372 1626.7 70.47 0.000
m0 32.65 2 -811.679 1627.4 71.09 0.000
Models ranked by AIC(x)
model.sel(m0, m1, m2, m3, m4, rank = BIC)Model selection table
(Int) cnd prt cnd:prt df logLik BIC delta weight
m2 34.85 + 3 -775.136 1566.5 0.00 0.921
m3 34.49 + + 4 -774.961 1571.5 5.04 0.074
m4 34.34 + + + 5 -774.888 1576.7 10.29 0.005
m0 32.65 2 -811.679 1634.1 67.69 0.000
m1 31.56 + 3 -810.372 1636.9 70.47 0.000
Models ranked by BIC(x)
# Model diagnostics
library(performance)
check_model(m2) # Model predictive check
check_predictions(m2)
# Model summary
summary(m2)
Call:
glm(formula = eta ~ participant, family = Gamma(link = "identity"),
data = df1)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 34.8462 0.7094 49.12 <2e-16 ***
participantU -12.7146 1.2146 -10.47 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for Gamma family taken to be 0.07542025)
Null deviance: 20.954 on 219 degrees of freedom
Residual deviance: 15.098 on 218 degrees of freedom
AIC: 1556.3
Number of Fisher Scoring iterations: 3
# ES Cohen's d
library(effectsize)
cohens_d(df1$eta[df1$participant=="P"],df1$eta[df1$participant=="U"])Cohen's d | 95% CI
------------------------
1.34 | [0.97, 1.71]
- Estimated using pooled SD.
# ES Cliff's Delta
cliffs_delta(df1$eta[df1$participant=="P"],df1$eta[df1$participant=="U"])r (rank biserial) | 95% CI
--------------------------------
0.88 | [0.83, 0.92]
# Report results in console
library(report)
report(m2)We fitted a general linear model (Gamma family with a identity link) (estimated
using ML) to predict eta with participant (formula: eta ~ participant). The
model's explanatory power is substantial (Nagelkerke's R2 = 0.29). The model's
intercept, corresponding to participant = P, is at 34.85 (95% CI [33.49,
36.27], t(218) = 49.12, p < .001). Within this model:
- The effect of participant [U] is statistically significant and negative (beta
= -12.71, 95% CI [-15.04, -10.26], t(218) = -10.47, p < .001; Std. beta =
-12.71, 95% CI [-15.04, -10.26])
Standardized parameters were obtained by fitting the model on a standardized
version of the dataset. 95% Confidence Intervals (CIs) and p-values were
computed using a Wald t-distribution approximation.
3.2.3 Age differences between experiments
We compared the age composition with that of the original experiment.
# Importing the dataset of the original experiment
datacomplete <- read.csv2("datasetFLE.csv")
# Converting to factor some variables
datacomplete$experiment <- as.factor(datacomplete$experiment)
datacomplete$gender <- as.factor(datacomplete$gender)
datacomplete$nativeLanguage <- as.factor(datacomplete$nativeLanguage)
datacomplete$experimentLanguage <- as.factor(datacomplete$experimentLanguage)
datacomplete$contingency <- as.factor(datacomplete$contingency)
# Descriptive statistics of the age in the original experiment (2)
summary(datacomplete$age[datacomplete$experiment=="Experiment2"]) Min. 1st Qu. Median Mean 3rd Qu. Max.
16.00 18.00 20.00 23.81 24.50 53.00
round(stat.desc(datacomplete$age[datacomplete$experiment=="Experiment2"],
norm= T),2) nbr.val nbr.null nbr.na min max range
80.00 0.00 0.00 16.00 53.00 37.00
sum median mean SE.mean CI.mean.0.95 var
1905.00 20.00 23.81 1.03 2.06 85.39
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
9.24 0.39 1.87 3.47 2.52 2.37
normtest.W normtest.p
0.70 0.00
# Visualizing the combined dataframe
# Long data frame combining the original Experiment 2 ages with the
# replication sample (Prolific + University), for plotting and effect
# sizes. "0riginal" (with a zero) is kept as-is: the same literal is used
# when subsetting data_age below.
# NOTE(review): group sizes 220/182/38 are hard-coded — they must match
# the post-exclusion counts reported above.
data_age <- data.frame(
exp = as.factor(c(rep("0riginal",length(datacomplete$age[
datacomplete$experiment=="Experiment2"])), rep("Replication", 220))),
# "Unviersity" typo fixed in the group label
group = as.factor(c(rep("0riginal",length(datacomplete$age[
datacomplete$experiment=="Experiment2"])), rep("Prolific", 182),
rep("University", 38))),
age = c(datacomplete$age[datacomplete$experiment=="Experiment2"],
df1$eta[df1$participant=="P"], df1$eta[df1$participant=="U"])
)
datatable(data_age,
options = list(scrollX = TRUE, pageLength = 5),
class = 'cell-border stripe')We calculated measures of effect size.
# ES Cohen's d and Cliff's Delta
cohens_d(data_age$age[data_age$exp=="0riginal"],
data_age$age[data_age$exp=="Replication"], data_age)Cohen's d | 95% CI
--------------------------
-0.86 | [-1.13, -0.59]
- Estimated using pooled SD.
# ES Cliff's Delta
cliffs_delta(data_age$age[data_age$exp=="0riginal"],
data_age$age[data_age$exp=="Replication"], data_age)r (rank biserial) | 95% CI
----------------------------------
-0.62 | [-0.70, -0.52]
# Plot two experiment's age
ggplot(data_age, aes(y = age, x=exp, fill=exp)) +
geom_violin(alpha = 0.5)+
geom_boxplot(outlier.alpha = 0, alpha = 0.7) +
geom_jitter( alpha = 0.5, width=.1) +
labs(y = "Age", x = "Experiment") +
scale_fill_viridis_d(option = "turbo")+
theme_bw()3.3 Biological sex
3.3.1 Biological sex (Univariate)
We provided summary statistics and a visual representation of biological sex.
# Summary of sex
summary(df1$sesso) Femmina Maschio
102 118
# Plot
ggplot(df1, aes(x = sesso, fill = sesso)) +
geom_bar(alpha = 0.8) +
scale_fill_viridis_d(labels = c("Female", "Male")) +
scale_x_discrete(labels = c("F" = "Female", "M" = "Male")) +
labs(x = "Biological Sex", y = "Number of Participants", fill = "Sex") +
theme_bw()3.3.2 Biological sex between groups
To explore sex differences across participant sources (Prolific vs. University) and experimental conditions (Condition 1 vs. Condition 2), we began by visualizing and summarizing the distribution of sex within each group.
In a first comparison, we assessed differences in sex distribution between Prolific and University participants.
# Summary of sex by participant
by(df1$sesso, df1$participant, summary)df1$participant: P
Femmina Maschio
72 110
------------------------------------------------------------
df1$participant: U
Femmina Maschio
30 8
# Mosaic plot
ggplot(data = df1) +
geom_mosaic(mapping = aes(x = product(sesso, participant),
fill = participant)) +
labs(x = "Sex", y = "Count", fill = "Participant Source") +
theme_bw() +
scale_fill_viridis_d(alpha = 0.2, labels = c("University", "Prolific"))# Chi-squared test and effect size
chisq.test(as.matrix(table(df1$sesso, df1$participant)))
Pearson's Chi-squared test with Yates' continuity correction
data: as.matrix(table(df1$sesso, df1$participant))
X-squared = 18.059, df = 1, p-value = 2.141e-05
model_parameters(prop.test(as.matrix(table(df1$participant, df1$sesso))))2-sample test for equality of proportions
Proportion | Difference | 95% CI | Chi2(1) | p
----------------------------------------------------------------
39.56% / 78.95% | 39.39% | [-0.56, -0.23] | 18.06 | < .001
Alternative hypothesis: two.sided
# Differences using the Bayes Factor
library(BayesFactor)
contingencyTableBF(table(df1$sesso, df1$participant),
sampleType = "jointMulti")Bayes factor analysis
--------------
[1] Non-indep. (a=1) : 4081.47 ±0%
Against denominator:
Null, independence, a = 1
---
Bayes factor type: BFcontingencyTable, joint multinomial
In the second comparison, we examined whether sex distribution differed across experimental conditions.
# Summary of sex by condition
by(df1$sesso, df1$condition, summary)df1$condition: 1
Femmina Maschio
58 52
------------------------------------------------------------
df1$condition: 2
Femmina Maschio
44 66
# Mosaic plot
ggplot(data = df1) +
geom_mosaic(mapping = aes(x = product(sesso, condition), fill = condition)) +
labs(x = "Condition", y = "Sex", fill = "Condition") +
theme_bw() +
scale_fill_viridis_d(alpha = 0.2, labels = c("1", "2"))# Chi-squared test and effect size
chisq.test(as.matrix(table(df1$sesso, df1$condition)))
Pearson's Chi-squared test with Yates' continuity correction
data: as.matrix(table(df1$sesso, df1$condition))
X-squared = 3.0891, df = 1, p-value = 0.07882
model_parameters(prop.test(as.matrix(table(df1$sesso, df1$condition))))2-sample test for equality of proportions
Proportion | Difference | 95% CI | Chi2(1) | p
--------------------------------------------------------------
56.86% / 44.07% | 12.79% | [-0.01, 0.27] | 3.09 | 0.079
Alternative hypothesis: two.sided
# Differences using the Bayes Factor
contingencyTableBF(table(df1$sesso, df1$condition), sampleType = "indepMulti",
fixedMargin = "cols")Bayes factor analysis
--------------
[1] Non-indep. (a=1) : 0.989198 ±0%
Against denominator:
Null, independence, a = 1
---
Bayes factor type: BFcontingencyTable, independent multinomial
3.3.3 Biological sex between experiments
We compared the sex composition with that of the original experiment.
# Summary in the original experiment 2
summary(datacomplete$gender[datacomplete$experiment == "Experiment2"]) man woman
31 49
# Combine data
data_sex <- data.frame(
exp = as.factor(c(
rep("Original", length(datacomplete$gender[
datacomplete$experiment == "Experiment2"])),
rep("Replication", 220)
)),
group = as.factor(c(
rep("Original", length(datacomplete$age[
datacomplete$experiment == "Experiment2"])),
rep("Prolific", 182),
rep("University", 38)
)),
sex = as.factor(as.character(c(
ifelse(datacomplete$gender[
datacomplete$experiment == "Experiment2"] == "man", "Maschio", "Femmina"),
as.character(df1$sesso[df1$participant == "P"]),
as.character(df1$sesso[df1$participant == "U"])
)))
)
# Mosaic plot of sex by experiment
# (labels fixed: this plot shows Experiment, not Condition — the labs()
#  call was copied from the condition plot above, but data_sex has no
#  condition variable)
ggplot(data_sex) +
geom_mosaic(aes(x = product(sex, exp), fill = exp)) +
labs(x = "Experiment", y = "Sex", fill = "Experiment") +
theme_bw() +
scale_fill_viridis_d(alpha = 0.6, labels = c("Original", "Replication"))# Chi-square test and report
x <- chisq.test(table(data_sex$sex, data_sex$exp))
report(x)Effect sizes were labelled following Funder's (2019) recommendations.
The Pearson's Chi-squared test with Yates' continuity correction of
independence between and suggests that the effect is statistically significant,
and small (chi2 = 4.62, p = 0.032; Adjusted Cramer's v = 0.12, 95% CI [0.00,
1.00])
# Differences using the Bayes Factor
contingencyTableBF(table(data_sex$sex, data_sex$exp), sampleType = "poisson")Bayes factor analysis
--------------
[1] Non-indep. (a=1) : 3.366873 ±0%
Against denominator:
Null, independence, a = 1
---
Bayes factor type: BFcontingencyTable, poisson
3.4 Years of formal education
3.4.1 Years of formal education (Univariate)
We provide summary statistics and a visual representation of education. The shape of the distribution is then assessed.
# Summary statistics of Formal education (i)
table(df1$scolarita)
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 30 8 9
2 1 2 20 10 29 35 21 59 15 10 5 4 2 1 0 2 2
# Summary statistics of Formal education (ii)
df1$scolarita <- as.numeric(as.character(df1$scolarita))
summary(df1$scolarita) Min. 1st Qu. Median Mean 3rd Qu. Max.
8.0 15.0 17.0 16.6 18.0 24.0
# Summary statistics of Formal education (iii)
round(stat.desc(df1$scolarita, norm = TRUE), 2) nbr.val nbr.null nbr.na min max range
220.00 0.00 0.00 8.00 24.00 16.00
sum median mean SE.mean CI.mean.0.95 var
3651.00 17.00 16.60 0.18 0.35 6.89
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
2.62 0.16 -0.39 -1.20 0.92 1.41
normtest.W normtest.p
0.96 0.00
# Plot
ggplot(df1, aes(x = scolarita)) +
geom_boxplot(width = 0.01, position = position_nudge(y = -0.01),
outlier.alpha = 0, fill = "darkorange", alpha = 0.5) +
geom_density(fill = "darkorange", alpha = 0.3) +
geom_jitter(mapping = aes(x = scolarita, y = -0.02),
position = position_jitter(height = 0.002),
color = "darkorange", alpha = 0.5) +
labs(x = "Years of formal education", y = "Density") +
theme_bw()# Assess distribution shape
descdist(df1$scolarita)summary statistics
------
min: 8 max: 24
median: 17
mean: 16.59545
estimated sd: 2.624955
estimated skewness: -0.3977193
estimated kurtosis: 4.007196
# Fit candidate distributions
fnorm <- fitdist(df1$scolarita, "norm")
fgamma <- fitdist(df1$scolarita, "gamma")
fpois <- fitdist(df1$scolarita, "pois")
# Compare fitted distributions
plot.legend <- c("Normal", "Gamma", "Poisson")
par(mfrow = c(2, 2))
denscomp(list(fnorm, fgamma, fpois), legendtext = plot.legend)
qqcomp(list(fnorm, fgamma, fpois), legendtext = plot.legend)
cdfcomp(list(fnorm, fgamma, fpois), legendtext = plot.legend)
ppcomp(list(fnorm, fgamma, fpois), legendtext = plot.legend)# Goodness-of-fit statistics
gofstat(list(fnorm, fgamma, fpois), fitnames = plot.legend)Goodness-of-fit statistics
Normal Gamma Poisson
Kolmogorov-Smirnov statistic 0.140486 0.1458965 0.2316732
Cramer-von Mises statistic 0.629528 0.8088191 2.9250835
Anderson-Darling statistic 3.393844 4.4808730 15.7608110
Goodness-of-fit criteria
Normal Gamma Poisson
Akaike's Information Criterion 1051.959 1069.030 1118.188
Bayesian Information Criterion 1058.746 1075.818 1121.582
We created a new categorical variable, education_level, derived from participants’ reported years of schooling (scolarita). This classification captures the most common academic milestones in ascending order. Participants with 5 to 8 years of schooling were categorized as having completed Primary School. Those with 9 to 13 years were assigned to Secondary School. Between 14 and 16 years corresponds to High School. Participants with 17 to 18 years were classified as having a Bachelor’s Degree, while those with more than 18 years of education were grouped under Master’s Degree and more. This variable provides a more interpretable summary of the sample’s educational background according to Italian law.
# Derive a categorical education level from years of schooling (scolarita),
# following the Italian schooling milestones described above. case_when()
# is evaluated top-down, so each range implicitly excludes the ones above.
library(dplyr)
df1 <- df1 %>%
  mutate(
    education_level = factor(
      case_when(
        scolarita > 18 ~ "5.Master’s Degree and more",
        scolarita > 16 ~ "4.Bachelor’s Degree",
        scolarita > 13 ~ "3.High School",
        scolarita > 8 ~ "2.Secondary School",
        scolarita > 5 ~ "1.Primary School",
        TRUE ~ "0.No Primary School"
      )
    )
  )
# Frequencies table
table(df1$education_level)
1.Primary School 2.Secondary School
2 27
3.High School 4.Bachelor’s Degree
74 80
5.Master’s Degree and more
37
# Plot
ggplot(df1, aes(x = education_level, fill = education_level)) +
geom_bar(alpha = 0.8) +
scale_fill_viridis_d(option = "plasma") +
labs(x = "Education Level", y = "Number of Participants",
fill = "Education Level") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))We also asked participants whether they were currently still pursuing formal education.
# Frequencies table
table(df1$studi)
No Sì
119 101
# Combining with educational level and describing frequencies
table_study_edu <- table(df1$education_level, df1$studi)
table_study_edu
No Sì
1.Primary School 2 0
2.Secondary School 24 3
3.High School 32 42
4.Bachelor’s Degree 45 35
5.Master’s Degree and more 16 21
# Bar plot
library(tidyr)
df_study_edu <- as.data.frame(table_study_edu) %>%
rename(Education = Var1, StudyStatus = Var2, Count = Freq)
ggplot(df_study_edu, aes(x = Education, y = Count, fill = StudyStatus)) +
geom_bar(stat = "identity", position = position_dodge()) +
scale_fill_viridis_d(option = "viridis", begin = 0.2, end = 0.4) +
labs(
x = "Education Level",
y = "Number of Participants",
fill = "Currently Studying"
) +
theme_bw() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "top"
)3.4.2 Years of formal education between groups
To explore differences in years of formal education between participant sources (Prolific vs. University) and conditions (1 vs. 2), we began by visualizing and summarizing descriptive statistics of education years across groups. We then fitted a series of generalized linear models (GLMs). These models included various combinations of predictors (participant, condition, and their interaction). To determine the most appropriate model, we used model selection criteria including the Akaike Information Criterion (AIC) and the Bayesian Information Criterion (BIC). Finally, we visualized and interpreted the selected model to understand how participant source and experimental condition predict years of formal education.
# Descriptive statistics for participants (i)
by(df1$scolarita, df1$participant, summary)df1$participant: P
Min. 1st Qu. Median Mean 3rd Qu. Max.
8.00 16.00 17.00 16.84 18.00 24.00
------------------------------------------------------------
df1$participant: U
Min. 1st Qu. Median Mean 3rd Qu. Max.
13.00 15.00 15.00 15.45 15.75 20.00
# Descriptive statistics for conditions (i)
by(df1$scolarita, df1$condition, summary)df1$condition: 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
10.00 15.00 17.00 16.84 18.00 24.00
------------------------------------------------------------
df1$condition: 2
Min. 1st Qu. Median Mean 3rd Qu. Max.
8.00 15.00 16.50 16.35 18.00 23.00
# Descriptive statistics for participants and conditions (i)
by(df1$scolarita, list(df1$participant, df1$condition), summary): P
: 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
10.00 16.00 18.00 17.24 18.00 24.00
------------------------------------------------------------
: U
: 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
13.00 15.00 15.00 15.48 16.00 19.00
------------------------------------------------------------
: P
: 2
Min. 1st Qu. Median Mean 3rd Qu. Max.
8.00 15.00 17.00 16.48 18.00 23.00
------------------------------------------------------------
: U
: 2
Min. 1st Qu. Median Mean 3rd Qu. Max.
13.00 15.00 15.00 15.38 15.00 20.00
# Descriptive statistics for participants (i)
by(df1$scolarita, df1$participant, DESC)df1$participant: P
nbr.val nbr.null nbr.na min max range
182.00 0.00 0.00 8.00 24.00 16.00
sum median mean SE.mean CI.mean.0.95 var
3064.00 17.00 16.84 0.20 0.40 7.34
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
2.71 0.16 -0.62 -1.71 1.21 1.69
normtest.W normtest.p
0.94 0.00
------------------------------------------------------------
df1$participant: U
nbr.val nbr.null nbr.na min max range
38.00 0.00 0.00 13.00 20.00 7.00
sum median mean SE.mean CI.mean.0.95 var
587.00 15.00 15.45 0.29 0.59 3.23
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
1.80 0.12 0.83 1.09 -0.08 -0.06
normtest.W normtest.p
0.83 0.00
# Descriptive statistics for conditions (i)
by(df1$scolarita, df1$condition, DESC)df1$condition: 1
nbr.val nbr.null nbr.na min max range
110.00 0.00 0.00 10.00 24.00 14.00
sum median mean SE.mean CI.mean.0.95 var
1852.00 17.00 16.84 0.22 0.43 5.29
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
2.30 0.14 0.11 0.25 0.76 0.83
normtest.W normtest.p
0.96 0.00
------------------------------------------------------------
df1$condition: 2
nbr.val nbr.null nbr.na min max range
110.00 0.00 0.00 8.00 23.00 15.00
sum median mean SE.mean CI.mean.0.95 var
1799.00 16.50 16.35 0.28 0.55 8.43
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
2.90 0.18 -0.54 -1.18 0.49 0.53
normtest.W normtest.p
0.96 0.00
# Descriptive statistics for participants and conditions (ii)
by(df1$scolarita, list(df1$participant, df1$condition), DESC): P
: 1
nbr.val nbr.null nbr.na min max range
85.00 0.00 0.00 10.00 24.00 14.00
sum median mean SE.mean CI.mean.0.95 var
1465.00 18.00 17.24 0.25 0.49 5.18
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
2.28 0.13 -0.06 -0.12 1.42 1.38
normtest.W normtest.p
0.94 0.00
------------------------------------------------------------
: U
: 1
nbr.val nbr.null nbr.na min max range
25.00 0.00 0.00 13.00 19.00 6.00
sum median mean SE.mean CI.mean.0.95 var
387.00 15.00 15.48 0.37 0.76 3.43
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
1.85 0.12 0.52 0.56 -0.85 -0.47
normtest.W normtest.p
0.87 0.00
------------------------------------------------------------
: P
: 2
nbr.val nbr.null nbr.na min max range
97.00 0.00 0.00 8.00 23.00 15.00
sum median mean SE.mean CI.mean.0.95 var
1599.00 17.00 16.48 0.31 0.61 9.04
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
3.01 0.18 -0.68 -1.38 0.50 0.51
normtest.W normtest.p
0.95 0.00
------------------------------------------------------------
: U
: 2
nbr.val nbr.null nbr.na min max range
13.00 0.00 0.00 13.00 20.00 7.00
sum median mean SE.mean CI.mean.0.95 var
200.00 15.00 15.38 0.49 1.06 3.09
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
1.76 0.11 1.41 1.14 1.33 0.56
normtest.W normtest.p
0.70 0.00
# Participants
p1 <- ggplot(df1, aes(x = scolarita)) +
geom_boxplot(width = 0.06, position = position_nudge(y = -0.09),
outlier.alpha = 0, fill = "darkorange", alpha = 0.5) +
geom_density(fill = "darkorange", alpha = 0.3) +
geom_jitter(aes(y = -0.03), position = position_jitter(height = 0.02),
color = "darkorange", alpha = 0.5) +
labs(x = "Years of Education", y = "Density", title = "Participant") +
facet_wrap(~participant, labeller = label_both) +
theme_bw()
# Conditions
p2 <- ggplot(df1, aes(x = scolarita)) +
geom_boxplot(width = 0.01, position = position_nudge(y = -0.02),
outlier.alpha = 0, fill = "steelblue", alpha = 0.5) +
geom_density(fill = "steelblue", alpha = 0.3) +
geom_jitter(aes(y = -0.01), position = position_jitter(height = 0.002),
color = "steelblue", alpha = 0.5) +
labs(x = "Years of Education", y = "Density", title = "Condition") +
facet_wrap(~condition, labeller = label_both) +
theme_bw()
# Participant × condition
p3 <- ggplot(df1, aes(x = scolarita)) +
geom_boxplot(width = 0.04, position = position_nudge(y = -0.05),
outlier.alpha = 0, fill = "darkgreen", alpha = 0.5) +
geom_density(fill = "darkgreen", alpha = 0.3) +
geom_jitter(aes(y = -0.02), position = position_jitter(height = 0.01),
color = "darkgreen", alpha = 0.5) +
labs(x = "Years of Education", y = "Density",
title = "Participant × Condition") +
facet_grid(participant ~ condition, labeller = label_both) +
theme_bw()
# Plots
combined_plot <- p1 / p2 / p3 + plot_layout(guides = "collect") &
theme(legend.position = "none")
combined_plot# Model fitting
# Null model
m0 <- lm(scolarita ~ 1, data = df1)
# Condition model
m1 <- lm(scolarita ~ condition, data = df1)
# Participant model
m2 <- lm(scolarita ~ participant, data = df1)
# Additive
m3 <- lm(scolarita ~ condition + participant, data = df1)
# Interaction
m4 <- lm(scolarita ~ condition * participant, data = df1)
# Model selection
model.sel(m0, m1, m2, m3, m4, rank = AIC)Model selection table
(Int) cnd prt cnd:prt df logLik AIC delta weight
m3 17.18 + + 4 -517.751 1043.5 0.00 0.505
m2 16.84 + 3 -519.475 1044.9 1.45 0.245
m4 17.24 + + + 5 -517.512 1045.0 1.52 0.236
m0 16.60 2 -523.979 1052.0 8.46 0.007
m1 16.84 + 3 -523.045 1052.1 8.59 0.007
Models ranked by AIC(x)
model.sel(m0, m1, m2, m3, m4, rank = BIC)Model selection table
(Int) cnd prt cnd:prt df logLik BIC delta weight
m2 16.84 + 3 -519.475 1055.1 0.00 0.624
m3 17.18 + + 4 -517.751 1057.1 1.95 0.236
m0 16.60 2 -523.979 1058.7 3.62 0.102
m4 17.24 + + + 5 -517.512 1062.0 6.86 0.020
m1 16.84 + 3 -523.045 1062.3 7.14 0.018
Models ranked by BIC(x)
#No significative Condition coefficient
summary(m3)
Call:
lm(formula = scolarita ~ condition + participant, data = df1)
Residuals:
Min 1Q Median 3Q Max
-8.5331 -1.1798 -0.0219 1.4669 6.8202
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 17.1798 0.2660 64.582 < 2e-16 ***
condition2 -0.6467 0.3493 -1.851 0.06548 .
participantU -1.5112 0.4620 -3.271 0.00125 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.563 on 217 degrees of freedom
Multiple R-squared: 0.05505, Adjusted R-squared: 0.04634
F-statistic: 6.321 on 2 and 217 DF, p-value: 0.002148
# Model diagnostics and prediction
check_model(m2) check_predictions(m2)# Final model summary
summary(m2)
Call:
lm(formula = scolarita ~ participant, data = df1)
Residuals:
Min 1Q Median 3Q Max
-8.8352 -0.8352 0.1648 1.1648 7.1648
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 16.8352 0.1911 88.111 < 2e-16 ***
participantU -1.3878 0.4597 -3.019 0.00284 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.578 on 218 degrees of freedom
Multiple R-squared: 0.04012, Adjusted R-squared: 0.03572
F-statistic: 9.112 on 1 and 218 DF, p-value: 0.002841
#Report
report(m2)We fitted a linear model (estimated using OLS) to predict scolarita with
participant (formula: scolarita ~ participant). The model explains a
statistically significant and weak proportion of variance (R2 = 0.04, F(1, 218)
= 9.11, p = 0.003, adj. R2 = 0.04). The model's intercept, corresponding to
participant = P, is at 16.84 (95% CI [16.46, 17.21], t(218) = 88.11, p < .001).
Within this model:
- The effect of participant [U] is statistically significant and negative (beta
= -1.39, 95% CI [-2.29, -0.48], t(218) = -3.02, p = 0.003; Std. beta = -0.53,
95% CI [-0.87, -0.18])
Standardized parameters were obtained by fitting the model on a standardized
version of the dataset. 95% Confidence Intervals (CIs) and p-values were
computed using a Wald t-distribution approximation.
# ES Cohen's d
cohens_d(df1$scolarita[df1$participant=="P"],df1$scolarita[df1$participant=="U"])Cohen's d | 95% CI
------------------------
0.54 | [0.18, 0.89]
- Estimated using pooled SD.
# ES Cliff's Delta
cliffs_delta(df1$scolarita[df1$participant=="P"],df1$scolarita[df1$participant=="U"])r (rank biserial) | 95% CI
--------------------------------
0.40 | [0.22, 0.56]
4. Linguistic features
4.1 Preliminary considerations
Our final sample includes exactly 220 native Italian speakers who are not native English speakers and who do not currently live in countries where English is the predominant language.
# Number of native Italian speakers
sum(df1$linguanativa_i)[1] 220
# Number of native English speakers
sum(df1$linguanativa_e)[1] 0
# Number of participants living abroad in English-speaking countries
sum(df1$estero)[1] 0
4.2 Objective English test
4.2.1 Individual questions
The English objective test consisted of 25 questions, each scored as either incorrect (0) or correct (1). Below, we visualize the distribution of responses across these questions.
# Calculate accuracy proportion per question (columns 47-71 hold the 25
# binary-scored items). colMeans() replaces colSums()/220 so the divisor
# is no longer a hard-coded duplicate of nrow(df1).
accuracy_vec <- colMeans(df1[, 47:(47 + 24)])
knitr::kable(accuracy_vec)| x | |
|---|---|
| PI_1 | 0.9909091 |
| PI_2 | 0.9863636 |
| PI_3 | 0.8909091 |
| PI_4 | 0.8863636 |
| PI_5 | 0.9181818 |
| PI_6 | 0.8454545 |
| PI_7 | 0.8272727 |
| PI_8 | 0.8454545 |
| PI_9 | 0.8318182 |
| PI_10 | 0.9636364 |
| PI_11 | 0.8318182 |
| PI_12 | 0.2454545 |
| PI_13 | 0.9090909 |
| PI_14 | 0.3727273 |
| PI_15 | 0.8954545 |
| PI_16 | 0.7681818 |
| PI_17 | 0.9272727 |
| PI_18 | 0.6272727 |
| PI_19 | 0.6454545 |
| PI_20 | 0.6590909 |
| PI_21 | 0.3909091 |
| PI_22 | 0.6772727 |
| PI_23 | 0.8227273 |
| PI_24 | 0.5909091 |
| PI_25 | 0.5227273 |
# Plot
questions <- paste0("Q", 1:25)
heatmap_data <- data.frame(Question = questions, Accuracy = accuracy_vec)
heatmap_data$Question <- factor(heatmap_data$Question, levels = heatmap_data$Question[order(heatmap_data$Accuracy)])
ggplot(heatmap_data, aes(x = Question, y = Accuracy, fill = Accuracy)) +
geom_col() +
scale_fill_gradient(low = "red3", high = "darkgreen", limits = c(0,1), labels = scales::percent) +
labs(x = "Question", y = "Accuracy") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))To assess whether there is substantial evidence for a difference between the two conditions on the objective test, we begin by graphically inspecting the data, which suggests no notable differences between the two groups. Next, for each of the 25 questions, we compute a 2×2 contingency table comparing the condition (English group vs. Italian group) and the response distribution (1 or 0). For each contingency table, we calculate the Bayes factor to test the assumption of independence between group and response.
# Extract question data (columns 47 to 71)
questions_data <- df1[, 47:(47 + 24)]
conditions <- df1$condition
questions <- colnames(questions_data)
unique_conditions <- unique(conditions)
# Calculate accuracy per question and condition.
# Build one data.frame per condition and bind once, instead of growing a
# data.frame with rbind() inside nested loops (quadratic copying).
accuracy_by_condition <- do.call(rbind, lapply(unique_conditions, function(cond) {
  subset_data <- questions_data[conditions == cond, ]
  data.frame(condition = cond,
             Question = questions,
             Accuracy = colMeans(subset_data, na.rm = TRUE))
}))
# Order questions within each condition by ascending accuracy, so the
# faceted bar plot below displays them from hardest to easiest.
accuracy_by_condition_ordered <- do.call(rbind,
lapply(split(accuracy_by_condition, accuracy_by_condition$condition), function(df) {
df$Question <- factor(df$Question, levels = df$Question[order(df$Accuracy)])
df[order(df$Accuracy), ]
}))
# Re-level the combined factor so both facets share one consistent ordering
accuracy_by_condition_ordered$Question <- factor(
accuracy_by_condition_ordered$Question,
levels = unique(accuracy_by_condition_ordered$Question))
# Plot
ggplot(accuracy_by_condition_ordered, aes(x = Question,
y = Accuracy, fill = Accuracy)) +
geom_col() +
scale_fill_gradient(low = "red3", high = "darkgreen", limits = c(0,1),
labels = scales::percent) +
labs(title = "Proportion Correct per English Test Question by Condition",
x = "Question", y = "Accuracy") +
facet_wrap(~condition, scales = "free_x") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))# Applying Bayes Factor
bf_numeric_vec <- numeric(25)
bf_interpretation <- character(25)
questions <- paste0("Q", 1:25)
for (i in 47:(47 + 24)) {
contingency_table <- table(df1[[i]], df1$condition)
bf <- 1/contingencyTableBF(contingency_table,
sampleType = "indepMulti", fixedMargin = "cols")
bf_numeric <- exp(bf@bayesFactor$bf)
bf_numeric_vec[i - 46] <- bf_numeric
bf_interpretation[i - 46] <- ifelse(
bf_numeric > 3, "Supported Equality",
ifelse(bf_numeric < 1/3, "Supported Difference",
ifelse(bf_numeric < 1, "Weak Difference",
"Weak Equality")))
}
bf_df <- data.frame(
Question = questions,
BayesFactor = bf_numeric_vec,
Interpretation = factor(bf_interpretation,
levels = c("Supported Equality",
"Weak Equality",
"Weak Difference",
"Supported Difference")))
colors <- c("Supported Equality" = "darkgreen", "Weak Equality" = "yellow4",
"Weak Difference" = "pink4", "Supported Difference" = "darkred")
#Visualizing results
ggplot(bf_df, aes(x = reorder(Question, BayesFactor), y = BayesFactor)) +
geom_point(aes(color = Interpretation, size = 12)) +
scale_color_manual(values = colors) +
geom_hline(yintercept = 3, linetype = "dashed",
size = 1, color = "orange") +
geom_hline(yintercept = 1/3, linetype = "dashed",
size = 1, color = "orange") +
geom_hline(yintercept = 1, linetype = "dashed", size = 1) +
scale_size_continuous(range = c(3, 8), guide = "none") +
coord_flip() +
labs(x = "Question", y = "Bayes Factor for Independence") +
theme_bw() +
theme(legend.position = "bottom")4.2.2 Sum scores
The sum score of the objective English test is summarized and visualized.
# Summary statistics (i)
summary(df1$ok_total) Min. 1st Qu. Median Mean 3rd Qu. Max.
7.00 16.00 20.00 18.87 22.00 25.00
# Summary statistics (ii)
round(stat.desc(df1$ok_total, norm = TRUE), 2) nbr.val nbr.null nbr.na min max range
220.00 0.00 0.00 7.00 25.00 18.00
sum median mean SE.mean CI.mean.0.95 var
4152.00 20.00 18.87 0.28 0.54 16.67
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
4.08 0.22 -0.76 -2.33 0.02 0.04
normtest.W normtest.p
0.94 0.00
# Plot
ggplot(df1, aes(x = ok_total)) +
geom_boxplot(width = 0.004, position = position_nudge(y = -0.005),
outlier.alpha = 0, fill = "darkorange", alpha = 0.5) +
geom_density(fill = "darkorange", alpha = 0.3) +
geom_jitter(mapping = aes(x = ok_total, y = -0.01),
position = position_jitter(height = 0.002),
color = "darkorange", alpha = 0.5) +
labs(x = "Sum scores", y = "Density") +
theme_bw()4.3 English diploma
Among the 220 participants, 139 (63.2%) reported not having an English diploma, while 14 (6.4%) had obtained one within the last two years and 67 (30.5%) received theirs more than two years ago. Regarding diploma scores aligned with CEFR and IELTS levels, the most frequently reported level was B2 (5.5–6 IELTS), indicated by 35 participants, followed by 27 participants who reported a C1 level (7–8 IELTS). Fewer participants reported levels B1, A2, or A1, and four individuals either did not recall their score or indicated a score not included in the predefined list. Additionally, three participants provided open-ended responses describing alternative qualifications, including a TOEFL score, a Cambridge Proficiency exam from the 1990s, and a case where the respondent stated they no longer remember the details of their diploma.
# Frequency table of English diploma types
diploma_table <- table(df1$diploma_e)
knitr::kable(diploma_table, caption = "English Diploma")| Var1 | Freq |
|---|---|
| No | 139 |
| Sì, l’ho ottenuta negli ultimi due anni | 14 |
| Sì, l’ho ottenuta più di due anni fa | 67 |
# Frequency table of English diploma scores
score_table <- table(df1$punteggiodiploma)
knitr::kable(score_table, caption = "Distribution of English Diploma Scores")| Var1 | Freq |
|---|---|
| A1 CEFR (2 - 3 IELTS)LL | 1 |
| A2 CEFR (3 - 4 IELTS) | 3 |
| B1 CEFR (4 - 5 IELTS) | 11 |
| B2 CEFR (5.5 - 6 IELTS) | 35 |
| C1 CEFR (7 - 8 IELTS) | 27 |
| La valutazione non è presente in questa lista | 3 |
| Non ricordo la valutazione | 1 |
# Frequency of alternative diploma descriptions
alt_diploma_table <- table(df1$altrodiploma)
knitr::kable(alt_diploma_table, caption = "Other English Diploma Responses")| Var1 | Freq |
|---|---|
| cambridge proficiency nel 1990 penso 93/100 ma non ricordo adesso . parlo Inglese da sempre | 1 |
| Sono passati più di 30 anni, non ricordo davvero | 1 |
| TOEFL - 98 | 1 |
# English translations of the open-ended diploma responses listed above
open_responses <- data.frame(
  Response = c("TOEFL - 98 (C1)",
               "Cambridge Proficiency 93/100 (Not interpretable)",
               "Do not remember")
)
knitr::kable(open_responses, caption = "Open-Ended Diploma Responses")| Response |
|---|
| TOEFL - 98 (C1) |
| Cambridge Proficiency 93/100 (Not interpretable) |
| Do not remember |
# Diploma presence (counts copied from the frequency table above)
diploma_presence <- data.frame(
  category = c("No Diploma",
               "Obtained in Last 2 Years",
               "Obtained >2 Years Ago"),
  count = c(139, 14, 67)
)
# Diploma scores (counts copied from the score table above)
diploma_scores <- data.frame(
  level = c("A1 (2–3 IELTS)", "A2 (3–4 IELTS)", "B1 (4–5 IELTS)",
            "B2 (5.5–6 IELTS)", "C1 (7–8 IELTS)", "Other/Unknown"),
  count = c(1, 3, 11, 35, 27, 4)
)
# Diploma Presence
p1 <- ggplot(diploma_presence, aes(x = reorder(category, -count), y = count)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_text(aes(label = count), vjust = -0.5) +
labs(title = "English Diploma Presence", x = NULL,
y = "Number of Participants") +
theme_bw()
# Diploma Scores
p2 <- ggplot(diploma_scores, aes(x = reorder(level, count), y = count)) +
geom_bar(stat = "identity", fill = "darkgreen") +
geom_text(aes(label = count), vjust = -0.5) +
labs(title = "English Diploma Scores (CEFR / IELTS)",
x = NULL, y = "Number of Participants") +
theme_bw()
# Show both plots
library(gridExtra)
grid.arrange(p1, p2, ncol = 1)4.4 Subjective English proficiency
We performed a Confirmatory Factor Analysis to evaluate if the four subjective proficiency measures can be represented by one common latent factor.
# CFA libraries
library(lavaan)
library(semPlot)
library(semTools)
# Model
model_1 <- "Proeng =~ 0+lett_e + comp_e + scri_e + parl_e
lett_e ~~ comp_e
"
# CFA and summary
fit1 <- cfa(model_1, data = df1, std.lv=T, ordered = T, estimator = "DWLS")
summary(fit1, standardized=T, fit.measures= T)lavaan 0.6-19 ended normally after 14 iterations
Estimator DWLS
Optimization method NLMINB
Number of model parameters 34
Number of observations 220
Model Test User Model:
Test statistic 1.637
Degrees of freedom 1
P-value (Chi-square) 0.201
Model Test Baseline Model:
Test statistic 3382.228
Degrees of freedom 6
P-value 0.000
User Model versus Baseline Model:
Comparative Fit Index (CFI) 1.000
Tucker-Lewis Index (TLI) 0.999
Root Mean Square Error of Approximation:
RMSEA 0.054
90 Percent confidence interval - lower 0.000
90 Percent confidence interval - upper 0.198
P-value H_0: RMSEA <= 0.050 0.316
P-value H_0: RMSEA >= 0.080 0.531
Standardized Root Mean Square Residual:
SRMR 0.016
Parameter Estimates:
Parameterization Delta
Standard errors Standard
Information Expected
Information saturated (h1) model Unstructured
Latent Variables:
Estimate Std.Err z-value P(>|z|) Std.lv Std.all
Proeng =~
Proeng 0.000 0.000 0.000
lett_e 0.659 0.034 19.265 0.000 0.659 0.659
comp_e 0.733 0.032 23.170 0.000 0.733 0.733
scri_e 0.991 0.037 26.891 0.000 0.991 0.991
parl_e 0.753 0.028 26.891 0.000 0.753 0.753
Covariances:
Estimate Std.Err z-value P(>|z|) Std.lv Std.all
.lett_e ~~
.comp_e 0.346 0.043 8.126 0.000 0.346 0.677
Thresholds:
Estimate Std.Err z-value P(>|z|) Std.lv Std.all
lett_e|t1 -2.362 0.262 -9.030 0.000 -2.362 -2.362
lett_e|t2 -1.922 0.175 -10.979 0.000 -1.922 -1.922
lett_e|t3 -1.393 0.122 -11.372 0.000 -1.393 -1.393
lett_e|t4 -0.674 0.092 -7.325 0.000 -0.674 -0.674
lett_e|t5 0.172 0.085 2.017 0.044 0.172 0.172
lett_e|t6 1.139 0.108 10.546 0.000 1.139 1.139
comp_e|t1 -2.609 0.342 -7.622 0.000 -2.609 -2.609
comp_e|t2 -2.208 0.225 -9.827 0.000 -2.208 -2.208
comp_e|t3 -1.691 0.147 -11.477 0.000 -1.691 -1.691
comp_e|t4 -1.161 0.109 -10.646 0.000 -1.161 -1.161
comp_e|t5 -0.605 0.091 -6.676 0.000 -0.605 -0.605
comp_e|t6 0.277 0.086 3.225 0.001 0.277 0.277
comp_e|t7 1.118 0.107 10.444 0.000 1.118 1.118
scri_e|t1 -2.609 0.342 -7.622 0.000 -2.609 -2.609
scri_e|t2 -1.795 0.159 -11.311 0.000 -1.795 -1.795
scri_e|t3 -1.525 0.132 -11.530 0.000 -1.525 -1.525
scri_e|t4 -0.998 0.102 -9.790 0.000 -0.998 -0.998
scri_e|t5 -0.349 0.087 -4.028 0.000 -0.349 -0.349
scri_e|t6 0.498 0.089 5.624 0.000 0.498 0.498
scri_e|t7 1.308 0.117 11.172 0.000 1.308 1.308
scri_e|t8 2.093 0.202 10.350 0.000 2.093 2.093
parl_e|t1 -2.093 0.202 -10.350 0.000 -2.093 -2.093
parl_e|t2 -1.525 0.132 -11.530 0.000 -1.525 -1.525
parl_e|t3 -1.118 0.107 -10.444 0.000 -1.118 -1.118
parl_e|t4 -0.733 0.093 -7.838 0.000 -0.733 -0.733
parl_e|t5 -0.195 0.085 -2.286 0.022 -0.195 -0.195
parl_e|t6 0.398 0.087 4.561 0.000 0.398 0.398
parl_e|t7 1.424 0.125 11.425 0.000 1.424 1.424
parl_e|t8 1.922 0.175 10.979 0.000 1.922 1.922
Variances:
Estimate Std.Err z-value P(>|z|) Std.lv Std.all
.lett_e 0.566 0.566 0.566
.comp_e 0.462 0.462 0.462
.scri_e 0.018 0.018 0.018
.parl_e 0.433 0.433 0.433
.Proeng 1.000 1.000 1.000
# Model visualization
semPaths(fit1, "stand", layout = "circle", intercepts = F)# Reliability indexes
#Alpha
psych::omega(df1[,c("lett_e" , "comp_e" , "scri_e" , "parl_e")],
plot = F)[3]$alpha
[1] 0.8732555
#Omega
as.numeric(psych::omega(df1[,c("lett_e" , "comp_e" , "scri_e" , "parl_e")],
plot = F)[4])[1] 0.9277055
4.4.1 Subjective English proficiency between experiments
We compared the subjective English proficiency sum scores between the two studies.
# Create sum scores: subjective English proficiency as the sum of the four
# self-ratings (reading, listening comprehension, writing, speaking).
# NOTE: rewritten from right-assignment (`-> df1$fl_e`) to conventional `<-`.
df1$fl_e <- df1$lett_e + df1$comp_e + df1$scri_e + df1$parl_e
# Combine our sample and the original Experiment 2 sample into one
# long-format dataframe (Group label + Value) for comparison
group1 <- datacomplete$SelfEvalForeign[datacomplete$experiment == "Experiment2"]
group2 <- df1$fl_e
Prof_data <- data.frame(
  Group = c(rep("SelfEvalForeign_Exp2", length(group1)),
            rep("fl_e", length(group2))),
  Value = c(group1, group2))
# Descriptive statistics
by(Prof_data$Value, Prof_data$Group, DESC)Prof_data$Group: fl_e
nbr.val nbr.null nbr.na min max range
220.00 0.00 0.00 12.00 40.00 28.00
sum median mean SE.mean CI.mean.0.95 var
6519.00 30.00 29.63 0.33 0.65 23.66
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
4.86 0.16 -0.65 -1.97 0.67 1.02
normtest.W normtest.p
0.97 0.00
------------------------------------------------------------
Prof_data$Group: SelfEvalForeign_Exp2
nbr.val nbr.null nbr.na min max range
80.00 0.00 0.00 21.00 36.00 15.00
sum median mean SE.mean CI.mean.0.95 var
2317.00 29.50 28.96 0.38 0.75 11.38
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
3.37 0.12 -0.19 -0.36 -0.92 -0.87
normtest.W normtest.p
0.97 0.04
# Descriptive statistics (ii)
by(Prof_data$Value, Prof_data$Group, DESC)Prof_data$Group: fl_e
nbr.val nbr.null nbr.na min max range
220.00 0.00 0.00 12.00 40.00 28.00
sum median mean SE.mean CI.mean.0.95 var
6519.00 30.00 29.63 0.33 0.65 23.66
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
4.86 0.16 -0.65 -1.97 0.67 1.02
normtest.W normtest.p
0.97 0.00
------------------------------------------------------------
Prof_data$Group: SelfEvalForeign_Exp2
nbr.val nbr.null nbr.na min max range
80.00 0.00 0.00 21.00 36.00 15.00
sum median mean SE.mean CI.mean.0.95 var
2317.00 29.50 28.96 0.38 0.75 11.38
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
3.37 0.12 -0.19 -0.36 -0.92 -0.87
normtest.W normtest.p
0.97 0.04
# Visualization
library(ggdist)
ggplot(Prof_data, aes(x = Value, y = Group, fill = Group)) +
stat_halfeye( alpha = 0.6, scale = 0.9,
slab_color = NA) +
theme_bw() +
labs(x = "Value",y = "Subjective evaluation of FL")+
theme(legend.position = "none")# Overlap between the two distributions
library(overlapping)
overlap(list(datacomplete$SelfEvalForeign[
datacomplete$experiment=="Experiment2"], df1$fl_e))$OV
[1] 0.8245524
# ES in Cohen's d
cohens_d(datacomplete$SelfEvalForeign[
datacomplete$experiment=="Experiment2"], df1$fl_e)Cohen's d | 95% CI
-------------------------
-0.15 | [-0.40, 0.11]
- Estimated using pooled SD.
# ES in Cliff's Delta
cliffs_delta(datacomplete$SelfEvalForeign[
datacomplete$experiment=="Experiment2"], df1$fl_e)r (rank biserial) | 95% CI
---------------------------------
-0.13 | [-0.27, 0.02]
# Bayesian t-test for the difference between the two groups
1/ttestBF(datacomplete$SelfEvalForeign[
datacomplete$experiment=="Experiment2"], df1$fl_e)Bayes factor analysis
--------------
[1] Null, mu1-mu2=0 : 3.816409 ±0.05%
Against denominator:
Alternative, r = 0.707106781186548, mu =/= 0
---
Bayes factor type: BFindepSample, JZS
4.5 Proficiency measures together
We calculated the correlation between the proficiency measures and visualized them together.
#Correlation between the objective and the subjective measure
cor(df1$fl_e, df1$ok_total)[1] 0.4899298
# Plot
library(ggExtra)
df1$fl_e <- df1$lett_e + df1$comp_e +df1$scri_e +df1$parl_e
cor(df1$fl_e, df1$ok_total)[1] 0.4899298
# Recode missing diploma scores with the placeholder "Z" so they become a
# distinct factor level (sorting last alphabetically)
df1$punteggiodiploma[is.na(df1$punteggiodiploma)] <- "Z"
df1$punteggiodiploma <- as.factor(df1$punteggiodiploma)
# Relabel the factor levels in one shot.
# NOTE(review): this relies on the alphabetical ordering of the existing
# levels lining up with this 8-label vector (the "Z" placeholder is assumed
# to map to "Not given") — verify against the raw coding scheme.
levels(df1$punteggiodiploma) <- c("A1", "A2", "B1",
"B2","C1", "Other",
"Don't remember", "Not given")
# (Re)compute the subjective proficiency sum score from the four self-ratings
df1$fl_e <- df1$lett_e + df1$comp_e +df1$scri_e +df1$parl_e
# Scatterplot: objective FL competence vs perceived FL competence,
# coloured by diploma level (Okabe-Ito colour-blind-safe palette)
x<- ggplot(df1, aes(x=ok_total, y= fl_e, colour = punteggiodiploma))+
theme_bw()+theme(legend.position="bottom")+
labs(y="Perceived FL competence",
x="Objective FL competence", colour="FL Diploma")+
geom_point()+scale_colour_okabe_ito()+scale_fill_okabe_ito()
ggMarginal(x, type="hist", groupFill = TRUE, alpha=.6)4.6 Age of Acquisition (AoA)
4.6.1 AOA (Univariate)
We provide summary statistics and a visual representation of AoA. The shape of the distribution is then assessed.
# Summary statistics (i)
summary(df1$AOA_e) Min. 1st Qu. Median Mean 3rd Qu. Max.
3.000 6.000 7.000 8.123 10.000 25.000
# Summary statistics (ii)
round(stat.desc(df1$AOA_e, norm = TRUE), 2) nbr.val nbr.null nbr.na min max range
220.00 0.00 0.00 3.00 25.00 22.00
sum median mean SE.mean CI.mean.0.95 var
1787.00 7.00 8.12 0.22 0.43 10.38
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
3.22 0.40 1.63 4.96 3.87 5.92
normtest.W normtest.p
0.85 0.00
# Plot
ggplot(df1, aes(x = AOA_e)) +
geom_boxplot(width = 0.004, position = position_nudge(y = -0.005),
outlier.alpha = 0, fill = "darkorange", alpha = 0.5) +
geom_density(fill = "darkorange", alpha = 0.3) +
geom_jitter(mapping = aes(x = AOA_e, y = -0.01),
position = position_jitter(height = 0.002),
color = "darkorange", alpha = 0.5) +
labs(x = "AoA", y = "Density") +
theme_bw()# Assess distribution
library(fitdistrplus)
descdist(df1$AOA_e)summary statistics
------
min: 3 max: 25
median: 7
mean: 8.122727
estimated sd: 3.222131
estimated skewness: 1.65045
estimated kurtosis: 7.046563
# Fit distributions
fnorm <- fitdist(df1$AOA_e, "norm")
fgamma <- fitdist(df1$AOA_e, "gamma")
fpois <- fitdist(df1$AOA_e, "pois")
# Compare distributions
plot.legend <- c("Normal", "Gamma", "Poisson")
par(mfrow = c(2, 2))
denscomp(list(fnorm, fgamma, fpois), legendtext = plot.legend)
qqcomp(list(fnorm, fgamma, fpois), legendtext = plot.legend)
cdfcomp(list(fnorm, fgamma, fpois), legendtext = plot.legend)
ppcomp(list(fnorm, fgamma, fpois), legendtext = plot.legend)# Goodness-of-fit statistics
gofstat(list(fnorm, fgamma, fpois), fitnames = plot.legend)Goodness-of-fit statistics
Normal Gamma Poisson
Kolmogorov-Smirnov statistic 0.2015898 0.1980539 0.1758884
Cramer-von Mises statistic 1.9270677 1.3533959 1.2442285
Anderson-Darling statistic 10.4639845 7.1647164 8.5907921
Goodness-of-fit criteria
Normal Gamma Poisson
Akaike's Information Criterion 1142.150 1078.648 1105.719
Bayesian Information Criterion 1148.937 1085.435 1109.113
4.6.2 AOA difference between groups
To explore AoA differences between participant type (Prolific vs. University) and condition (1 vs. 2), we began by visualizing and summarizing descriptive statistics of AoA across groups. We then fitted a series of generalized linear models (GLMs). These models included different combinations of predictors (participant, condition, and their interaction). We used model selection criteria – Akaike Information Criterion (AIC) and Bayesian Information Criterion (BIC) – to identify the best-fitting model. Finally, we visualized and interpreted the selected model.
# Descriptive stats
# AOA by participant (i)
by(df1$AOA_e, df1$participant, function(x) summary(x)) df1$participant: P
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.000 6.000 8.000 8.489 10.000 25.000
------------------------------------------------------------
df1$participant: U
Min. 1st Qu. Median Mean 3rd Qu. Max.
5.000 6.000 6.000 6.368 7.000 10.000
# AOA by condition (i)
by(df1$AOA_e, df1$condition, function(x) summary(x))df1$condition: 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.000 6.000 7.000 8.055 10.000 25.000
------------------------------------------------------------
df1$condition: 2
Min. 1st Qu. Median Mean 3rd Qu. Max.
4.000 6.000 7.000 8.191 10.000 20.000
# AOA by both (i)
by(df1$AOA_e, list(df1$participant, df1$condition), function(x) summary(x)) : P
: 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.000 6.000 8.000 8.482 10.000 25.000
------------------------------------------------------------
: U
: 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
5.0 6.0 6.0 6.6 7.0 10.0
------------------------------------------------------------
: P
: 2
Min. 1st Qu. Median Mean 3rd Qu. Max.
4.000 6.000 7.000 8.495 11.000 20.000
------------------------------------------------------------
: U
: 2
Min. 1st Qu. Median Mean 3rd Qu. Max.
5.000 5.000 6.000 5.923 6.000 8.000
# AOA by participant (ii)
by(df1$AOA_e, df1$participant, DESC)df1$participant: P
nbr.val nbr.null nbr.na min max range
182.00 0.00 0.00 3.00 25.00 22.00
sum median mean SE.mean CI.mean.0.95 var
1545.00 8.00 8.49 0.25 0.50 11.48
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
3.39 0.40 1.43 3.96 3.06 4.27
normtest.W normtest.p
0.88 0.00
------------------------------------------------------------
df1$participant: U
nbr.val nbr.null nbr.na min max range
38.00 0.00 0.00 5.00 10.00 5.00
sum median mean SE.mean CI.mean.0.95 var
242.00 6.00 6.37 0.20 0.40 1.48
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
1.22 0.19 1.39 1.82 1.81 1.21
normtest.W normtest.p
0.79 0.00
# AOA by condition (ii)
by(df1$AOA_e, df1$condition, DESC)df1$condition: 1
nbr.val nbr.null nbr.na min max range
110.00 0.00 0.00 3.00 25.00 22.00
sum median mean SE.mean CI.mean.0.95 var
886.00 7.00 8.05 0.30 0.59 9.78
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
3.13 0.39 2.21 4.79 7.82 8.55
normtest.W normtest.p
0.80 0.00
------------------------------------------------------------
df1$condition: 2
nbr.val nbr.null nbr.na min max range
110.00 0.00 0.00 4.00 20.00 16.00
sum median mean SE.mean CI.mean.0.95 var
901.00 7.00 8.19 0.32 0.63 11.07
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
3.33 0.41 1.12 2.43 0.69 0.75
normtest.W normtest.p
0.87 0.00
# AOA by both (ii)
by(df1$AOA_e, list(df1$participant, df1$condition), DESC): P
: 1
nbr.val nbr.null nbr.na min max range
85.00 0.00 0.00 3.00 25.00 22.00
sum median mean SE.mean CI.mean.0.95 var
721.00 8.00 8.48 0.37 0.73 11.37
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
3.37 0.40 1.96 3.75 6.21 6.01
normtest.W normtest.p
0.83 0.00
------------------------------------------------------------
: U
: 1
nbr.val nbr.null nbr.na min max range
25.00 0.00 0.00 5.00 10.00 5.00
sum median mean SE.mean CI.mean.0.95 var
165.00 6.00 6.60 0.26 0.55 1.75
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
1.32 0.20 1.24 1.34 0.92 0.51
normtest.W normtest.p
0.80 0.00
------------------------------------------------------------
: P
: 2
nbr.val nbr.null nbr.na min max range
97.00 0.00 0.00 4.00 20.00 16.00
sum median mean SE.mean CI.mean.0.95 var
824.00 7.00 8.49 0.35 0.69 11.69
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
3.42 0.40 0.96 1.95 0.32 0.33
normtest.W normtest.p
0.89 0.00
------------------------------------------------------------
: U
: 2
nbr.val nbr.null nbr.na min max range
13.00 0.00 0.00 5.00 8.00 3.00
sum median mean SE.mean CI.mean.0.95 var
77.00 6.00 5.92 0.24 0.52 0.74
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
0.86 0.15 0.85 0.69 0.18 0.08
normtest.W normtest.p
0.81 0.01
# Participant
p1 <- ggplot(df1, aes(x = AOA_e)) +
geom_boxplot(width = 0.02, position = position_nudge(y = -0.03),
outlier.alpha = 0, fill = "darkorange", alpha = 0.5) +
geom_density(fill = "darkorange", alpha = 0.3) +
geom_jitter(aes(y = -0.01),
position = position_jitter(height = 0.004),
color = "darkorange", alpha = 0.5) +
labs(x = "AoA", y = "Density", title = "Participants") +
facet_wrap(~participant, labeller = label_both) +
theme_bw()
# Condition
p2 <- ggplot(df1, aes(x = AOA_e)) +
geom_boxplot(width = 0.004, position = position_nudge(y = -0.007),
outlier.alpha = 0, fill = "steelblue", alpha = 0.5) +
geom_density(fill = "steelblue", alpha = 0.3) +
geom_jitter(aes(y = -0.003),
position = position_jitter(height = 0.001),
color = "steelblue", alpha = 0.5) +
labs(x = "AoA", y = "Density", title = "Condition") +
facet_wrap(~condition, labeller = label_both) +
theme_bw()
# Participant × Condition
p3 <- ggplot(df1, aes(x = AOA_e)) +
geom_boxplot(width = 0.02, position = position_nudge(y = -0.03),
outlier.alpha = 0, fill = "darkgreen", alpha = 0.5) +
geom_density(fill = "darkgreen", alpha = 0.3) +
geom_jitter(aes(y = -0.01),
position = position_jitter(height = 0.002),
color = "darkgreen", alpha = 0.5) +
labs(x = "AoA", y = "Density", title = "Participant × Condition") +
facet_grid(participant ~ condition, labeller = label_both) +
theme_bw()
# Combining the plots together
combined_plot <- p1 / p2 / p3 + plot_layout(guides = "collect") &
theme(legend.position = "none")
combined_plot#Models comparison with gamma (link function identity)
# Candidate GLMs for AoA: Gamma family with identity link, shared across
# all five models so the specification is stated once.
fam_gamma_id <- Gamma(link = "identity")
# Null (intercept-only) model
m0 <- glm(AOA_e ~ 1, data = df1, family = fam_gamma_id)
# Condition only
m1 <- glm(AOA_e ~ condition, data = df1, family = fam_gamma_id)
# Participant only
m2 <- glm(AOA_e ~ participant, data = df1, family = fam_gamma_id)
# Additive model: condition + participant
m3 <- glm(AOA_e ~ condition + participant, data = df1, family = fam_gamma_id)
# Interaction model: condition * participant
m4 <- glm(AOA_e ~ condition * participant, data = df1, family = fam_gamma_id)
# Model selection (AIC, BIC)
model.sel(m0, m1, m2, m3, m4, rank = AIC)Model selection table
(Int) cnd prt cnd:prt df logLik AIC delta weight
m2 8.489 + 3 -527.422 1060.8 0.00 0.621
m3 8.578 + + 4 -527.328 1062.7 1.81 0.251
m4 8.482 + + + 5 -527.000 1064.0 3.16 0.128
m0 8.123 2 -537.349 1078.7 17.85 0.000
m1 8.055 + 3 -537.289 1080.6 19.73 0.000
Models ranked by AIC(x)
model.sel(m0, m1, m2, m3, m4, rank = BIC)Model selection table
(Int) cnd prt cnd:prt df logLik BIC delta weight
m2 8.489 + 3 -527.422 1071.0 0.00 0.924
m3 8.578 + + 4 -527.328 1076.2 5.21 0.068
m4 8.482 + + + 5 -527.000 1081.0 9.94 0.006
m0 8.123 2 -537.349 1085.5 14.46 0.001
m1 8.055 + 3 -537.289 1090.8 19.73 0.000
Models ranked by BIC(x)
# Model diagnostics
check_model(m2) # Model predictive check
check_predictions(m2)# Model summary
summary(m2)
Call:
glm(formula = AOA_e ~ participant, family = Gamma(link = "identity"),
data = df1)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 8.4890 0.2341 36.258 < 2e-16 ***
participantU -2.1206 0.4501 -4.712 4.38e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for Gamma family taken to be 0.1384438)
Null deviance: 28.805 on 219 degrees of freedom
Residual deviance: 26.367 on 218 degrees of freedom
AIC: 1060.8
Number of Fisher Scoring iterations: 3
# ES Cohen's d
cohens_d(df1$AOA_e[df1$participant=="P"],df1$AOA_e[df1$participant=="U"])Cohen's d | 95% CI
------------------------
0.68 | [0.32, 1.03]
- Estimated using pooled SD.
# ES Cliff's Delta
cliffs_delta(df1$AOA_e[df1$participant=="P"],df1$AOA_e[df1$participant=="U"])r (rank biserial) | 95% CI
--------------------------------
0.40 | [0.22, 0.56]
# Report results
report(m2)We fitted a general linear model (Gamma family with a identity link) (estimated
using ML) to predict AOA_e with participant (formula: AOA_e ~ participant). The
model's explanatory power is weak (Nagelkerke's R2 = 0.09). The model's
intercept, corresponding to participant = P, is at 8.49 (95% CI [8.05, 8.96],
t(218) = 36.26, p < .001). Within this model:
- The effect of participant [U] is statistically significant and negative (beta
= -2.12, 95% CI [-2.97, -1.19], t(218) = -4.71, p < .001; Std. beta = -2.12,
95% CI [-2.97, -1.19])
Standardized parameters were obtained by fitting the model on a standardized
version of the dataset. 95% Confidence Intervals (CIs) and p-values were
computed using a Wald t-distribution approximation.
4.6.3 AOA differences between experiments
We compared the AoA in our study with that of the original study.
# Descriptive statistics of the AoA in the original experiment (2) i
summary(datacomplete$AoAForeign[datacomplete$experiment=="Experiment2"]) Min. 1st Qu. Median Mean 3rd Qu. Max.
2.000 5.000 6.000 6.575 7.000 35.000
# Descriptive statistics of the AoA in the original experiment (2) ii
round(stat.desc(datacomplete$AoAForeign[
datacomplete$experiment=="Experiment2"], norm= T),2) nbr.val nbr.null nbr.na min max range
80.00 0.00 0.00 2.00 35.00 33.00
sum median mean SE.mean CI.mean.0.95 var
526.00 6.00 6.58 0.48 0.96 18.65
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
4.32 0.66 3.87 7.19 21.47 20.19
normtest.W normtest.p
0.65 0.00
# Descriptive statistics of the age in the original experiment (1) i
summary(datacomplete$AoAForeign[datacomplete$experiment=="Experiment1"]) Min. 1st Qu. Median Mean 3rd Qu. Max.
4.00 10.75 13.50 12.61 15.00 20.00
# Descriptive statistics of the AoA in the original experiment (1) ii
round(stat.desc(datacomplete$AoAForeign[
datacomplete$experiment=="Experiment1"], norm= T),2) nbr.val nbr.null nbr.na min max range
36.00 0.00 0.00 4.00 20.00 16.00
sum median mean SE.mean CI.mean.0.95 var
454.00 13.50 12.61 0.68 1.38 16.64
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
4.08 0.32 -0.39 -0.50 -0.36 -0.23
normtest.W normtest.p
0.94 0.04
# Aggregating data
dfx <-
data.frame(
y = c(
datacomplete$AoAForeign[datacomplete$experiment=="Experiment1"],
datacomplete$AoAForeign[datacomplete$experiment=="Experiment2"],
df1$AOA_e),
x = factor(c(
rep("1.Or_E1",length=length(datacomplete$AoAForeign[
datacomplete$experiment=="Experiment1"])),
rep("3.Or_E2",length=length(datacomplete$AoAForeign[
datacomplete$experiment=="Experiment2"])),
rep("2. Ours",length=length(df1$AOA_e))))
)
# Visualization
ggplot(dfx, aes(x = x, y = y, fill = x)) +
stat_halfeye( alpha = 0.6, scale = 0.9,
slab_color = NA) +
theme_bw() +
labs(x = "Experiment",y = "AoA")+
theme(legend.position = "none")# Using sliding difference contrast and running brms linear model
contrasts(dfx$x) <- MASS::contr.sdif(3)
mod <- brms::brm(y~x, dfx, silent = 2)# Show model results
plot(mod)# Showing ROPE
bayestestR::rope(mod)# Proportion of samples inside the ROPE [-0.40, 0.40]:
Parameter | inside ROPE
-----------------------
Intercept | 0.00 %
x2M1 | 0.00 %
x3M2 | 0.00 %
5 Main Confirmative Hypothesis
5.1 Illusion of causality (univariate)
We provide summary statistics and a visual representation of causality judgments. The shape of the distribution is then assessed.
# Summary statistics (i)
summary(df1$causalita) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 43.00 63.50 57.69 75.00 97.00
# Summary statistics (ii)
round(stat.desc(df1$causalita, norm = TRUE), 2) nbr.val nbr.null nbr.na min max range
220.00 1.00 0.00 0.00 97.00 97.00
sum median mean SE.mean CI.mean.0.95 var
12691.00 63.50 57.69 1.53 3.01 514.49
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
22.68 0.39 -0.62 -1.89 -0.40 -0.61
normtest.W normtest.p
0.95 0.00
# Causality plot: raincloud-style view (boxplot + density + jittered points)
# of the causal-strength judgments (df1$causalita, 0-100 scale)
ggplot(df1, aes(x = causalita)) +
  geom_boxplot(width = 0.004, position = position_nudge(y = -0.005),
               outlier.alpha = 0, fill = "darkorange", alpha = 0.5) +
  geom_density(fill = "darkorange", alpha = 0.3) +
  geom_jitter(mapping = aes(x = causalita, y = -0.01),
              position = position_jitter(height = 0.002),
              color = "darkorange", alpha = 0.5) +
  # Axis label fixed: the variable plotted is the causality judgment,
  # not age (label was copy-pasted from the AoA plot)
  labs(x = "Judged Causal Strength", y = "Density") +
  theme_bw()# Assess distribution
descdist(df1$causalita, discrete=T)summary statistics
------
min: 0 max: 97
median: 63.5
mean: 57.68636
estimated sd: 22.68238
estimated skewness: -0.6292731
estimated kurtosis: 2.642964
# Fit distributions
fnorm <- fitdist(df1$causalita, "norm")
fpois <- fitdist(df1$causalita, "pois")
# Compare distributions
plot.legend <- c("Normal", "Poisson")
par(mfrow = c(2, 2))
denscomp(list(fnorm, fpois), legendtext = plot.legend)
qqcomp(list(fnorm, fpois), legendtext = plot.legend)
cdfcomp(list(fnorm, fpois), legendtext = plot.legend)
ppcomp(list(fnorm, fpois), legendtext = plot.legend)# Goodness-of-fit statistics
gofstat(list(fnorm,fpois), fitnames = plot.legend)Goodness-of-fit statistics
Normal Poisson
Kolmogorov-Smirnov statistic 0.1130812 0.3344536
Cramer-von Mises statistic 0.5172244 8.2584053
Anderson-Darling statistic 3.2322438 196.1548230
Goodness-of-fit criteria
Normal Poisson
Akaike's Information Criterion 2000.830 3647.682
Bayesian Information Criterion 2007.617 3651.076
5.2 Difference between the two groups (descriptive)
We visualize descriptive statistics illustrating the difference between the two groups for our main hypothesis.
# Descriptive statistics
aggregate(df1$causalita, list(df1$condition),
function(x) round(stat.desc(x), 2)) Group.1 x.nbr.val x.nbr.null x.nbr.na x.min x.max x.range x.sum
1 1 110.00 1.00 0.00 0.00 93.00 93.00 6134.00
2 2 110.00 0.00 0.00 3.00 97.00 94.00 6557.00
x.median x.mean x.SE.mean x.CI.mean.0.95 x.var x.std.dev x.coef.var
1 60.00 55.76 2.13 4.23 500.97 22.38 0.40
2 65.00 59.61 2.19 4.33 525.27 22.92 0.38
# Summary statistics
aggregate(df1$causalita, list(df1$condition),
function(x) round(summary(x), 2)) Group.1 x.Min. x.1st Qu. x.Median x.Mean x.3rd Qu. x.Max.
1 1 0.00 40.00 60.00 55.76 73.75 93.00
2 2 3.00 50.00 65.00 59.61 76.50 97.00
# Calculate Cohen's d effect size between conditions
library(effectsize)
cohens_d(df1$causalita[df1$condition == 2], df1$causalita[df1$condition == 1],
pooled_sd = TRUE)[[1]][1] 0.1697612
# Visualization
ggplot(df1, aes(x = causalita, y = condition)) +
stat_halfeye(aes(fill = factor(condition), colour = NA),
alpha = 0.6, scale = 0.5, size = 5, adjust = 0.9, .width = 0,
point_colour = NA, justification = -0.15) +
stat_dots(aes(color = factor(condition), fill = factor(condition)),
side = "bottom", dotsize = 1.2, binwidth = 1,
position = position_nudge(y = -0.09), alpha = 0.8) +
geom_boxplot(aes(fill = factor(condition)),
width = 0.12, outlier.color = NA, alpha = 0.5) +
coord_flip() +
theme_bw() +
labs(title = "Confirmative Hypothesis Descriptive Graphic",
x = "Judged Causal Strength", y = "Group Condition") +
scale_fill_manual(values = c("#D55E00", "#0072B2")) +
scale_color_manual(values = c("#D55E00", "#0072B2")) +
theme(legend.position = "none",
text = element_text(size = 16))5.3 Difference between the two groups (inferential)
5.3.1 Default prior
We calculated the Bayes factor using the default Cauchy prior.
#BF of the main hypothesis
1/ttestBF(df1$causalita[df1$condition==1], df1$causalita[df1$condition==2],
nullInterval= c(0, Inf)) denominator
numerator Alt., r=0.707 0<d<Inf Alt., r=0.707 !(0<d<Inf)
Null, mu1-mu2=0 14.47048 1.81623
5.3.2 Informed prior
We calculated the Bayes factor using the customized prior.
# STAN model specification under H_1
# Two-group normal model with a common standard deviation. The groups are
# shifted symmetrically around the grand mean mu by +/- delta*sigma/2.
# Priors: Jeffreys non-informative prior on the variance, and an informed
# prior on Cohen's d given by funzione_delta — a plateau-shaped density that
# concentrates mass roughly on delta in [0.2, 0.9] via two logistic ramps.
# (The Stan program below is a string literal and is left byte-identical.)
library(rstan)
Stan_Code_H1 <- '
functions{
real funzione_delta(real delta){
return (10/7)*((1 / (1 + exp(-65 * (delta - 0.2))) + 1 /
(1 + exp(65 * (delta - 0.9))))-1);
}
}
data {
int<lower=1> N1; // number of observations for the first group
int<lower=1> N2; // number of observations for the second group
vector[N1] y1; // Dependent variable for the first group
vector[N2] y2; // Dependent variable for the second group
}
parameters {
real <lower=0, upper=100> mu; // overall mean
real delta; // Cohens d
real<lower=0> sigma_2;
// standard deviation for both groups (assumed to be the same)
}
model {
// Defining the priors
target += log(1/(sigma_2)^2); // prior for the variance
// (Jeffreys non-informative prior)
target += log(funzione_delta(delta)); // prior for Cohens d
// Data model
target += normal_lpdf(y1 | mu + (delta * sigma_2 / 2), sigma_2);
target += normal_lpdf(y2 | mu - (delta * sigma_2 / 2), sigma_2);
}
'
# STAN model specification under H_0
# Null model: both groups share the same mean mu and standard deviation
# sigma_2 (delta fixed at 0). Jeffreys non-informative prior on the
# variance. Serves as the denominator model when computing the Bayes factor
# via bridge sampling below.
# (The Stan program below is a string literal and is left byte-identical.)
Stan_Code_H0 <- '
data {
int<lower=1> N1; // number of observations for the first group
int<lower=1> N2; // number of observations for the second group
vector[N1] y1; // Dependent variable for the first group
vector[N2] y2; // Dependent variable for the second group
}
parameters {
real <lower=0, upper=100> mu; // overall mean
real<lower=0> sigma_2; // standard deviation for both groups
}
model {
// Prior
target += log(1/(sigma_2)^2); // prior for the variance
//(Jeffreys non-informative prior)
// Data model
target += normal_lpdf(y1 | mu, sigma_2);
target += normal_lpdf(y2 | mu, sigma_2);
}
'
# Compile the STAN models (H1: informed prior on delta; H0: null model)
stan_M_H1 <- stan_model(model_code = Stan_Code_H1, model_name = "stanmodel")
stan_M_H0 <- stan_model(model_code = Stan_Code_H0, model_name = "stanmodel")
# Build the data list once so both fits use identical data. Sample sizes
# are derived from the data instead of being hard-coded as 110/110, so the
# code stays correct if the sample changes.
y_grp1 <- df1$causalita[df1$condition == 1]
y_grp2 <- df1$causalita[df1$condition == 2]
stan_data <- list(y1 = y_grp1, y2 = y_grp2,
                  N1 = length(y_grp1), N2 = length(y_grp2))
# Posterior sampling: 4 chains, 20000 iterations each, 500 warmup
stan_Fit_H1 <- sampling(stan_M_H1,
                        data = stan_data,
                        iter = 20000, warmup = 500, chains = 4, cores = 4,
                        verbose = FALSE)
stan_Fit_H0 <- sampling(stan_M_H0,
                        data = stan_data,
                        iter = 20000, warmup = 500, chains = 4, cores = 4,
                        verbose = FALSE)
# Computing the Bayes Factor
library(bridgesampling)
bayes_factor(
bridge_sampler(stan_Fit_H0),
bridge_sampler(stan_Fit_H1),
) -> bfbfEstimated Bayes factor in favor of x1 over x2: 366.20460
5.3.3 Models including demographic features
To further explore the absence of confirmatory results – and given that demographic characteristics were the primary differences between our sample and that of the original experiment – we investigated whether the observed null effect could be attributed to potential confounding demographic variables via Bayesian general linear model comparisons.
# Run all combinations of predictors including main effects and interactions.
# whichModels = "withmain" restricts the model space to models that only
# include an interaction when all of its constituent main effects are also
# present (the principle of marginality); neverExclude = NULL means no
# predictor is forced into every model.
bf_models <- generalTestBF(
formula = causalita ~ sesso * eta * scolarita * condition,
data = df1,
whichModels = "withmain",
neverExclude = NULL,
progress = FALSE
)
# View models with BF > 1
as.vector(bf_models)[which(as.vector(bf_models)>1)] sesso
2.477658
scolarita
4.396853
sesso + scolarita
8.060276
eta + scolarita
1.173413
sesso + eta + scolarita
2.966472
sesso + scolarita + sesso:scolarita
1.791553
sesso + condition
1.186153
scolarita + condition
1.045320
sesso + scolarita + condition
2.938560
sesso + eta + scolarita + condition
1.094764
sesso + scolarita + condition + sesso:condition
1.122820
#Assessing the best model
mxx <- brms::brm(causalita ~ scolarita + sesso, silent =2, df1)# Summary
mxx Family: gaussian
Links: mu = identity; sigma = identity
Formula: causalita ~ scolarita + sesso
Data: df1 (Number of observations: 220)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept 85.41 9.68 66.23 104.22 1.00 4622 2948
scolarita -1.44 0.58 -2.59 -0.31 1.00 4525 2867
sessoMaschio -6.95 3.01 -12.94 -1.15 1.00 4817 2885
Further Distributional Parameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma 22.20 1.05 20.34 24.39 1.00 4111 3036
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
# Plot results
plot(mxx)# Check predictions
check_predictions(mxx)6 Exploratory analyses
6.1 Process fluency hypothesis
6.1.1 Process fluency hypothesis (descriptive)
We provide summary statistics and a visual representation of process fluency. We visualize descriptive statistics illustrating the difference between the two groups.
# Summary statistics (univariate i)
summary(df1$pf) Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 1.000 2.000 2.327 3.000 7.000
# Summary statistics (univariate ii)
round(stat.desc(df1$pf, norm = TRUE), 2) nbr.val nbr.null nbr.na min max range
220.00 0.00 0.00 1.00 7.00 6.00
sum median mean SE.mean CI.mean.0.95 var
512.00 2.00 2.33 0.11 0.21 2.58
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
1.61 0.69 1.26 3.84 0.68 1.04
normtest.W normtest.p
0.79 0.00
# Summary statistics (with condition i)
aggregate(df1$pf, list(df1$condition), function(x) round(stat.desc(x),2)) Group.1 x.nbr.val x.nbr.null x.nbr.na x.min x.max x.range x.sum x.median
1 1 110.00 0.00 0.00 1.00 7.00 6.00 252.00 2.00
2 2 110.00 0.00 0.00 1.00 7.00 6.00 260.00 2.00
x.mean x.SE.mean x.CI.mean.0.95 x.var x.std.dev x.coef.var
1 2.29 0.16 0.31 2.78 1.67 0.73
2 2.36 0.15 0.29 2.40 1.55 0.66
# Summary statistics (with condition ii)
aggregate(df1$pf, list(df1$condition), function(x) round(summary(x),2)) Group.1 x.Min. x.1st Qu. x.Median x.Mean x.3rd Qu. x.Max.
1 1 1.00 1.00 2.00 2.29 3.00 7.00
2 2 1.00 1.00 2.00 2.36 3.00 7.00
# Effect sizes (Cliff's Delta)
cliffs_delta(df1$pf[df1$condition==2], df1$pf[df1$condition==1],
pooled_sd = T)[[1]][1] 0.07404959
#Effect sizes (Cohen's d)
cohens_d(df1$pf[df1$condition==2], df1$pf[df1$condition==1],
pooled_sd = T)[[1]][1] 0.04520953
# Process fluency plot
ggplot(df1, aes(x = pf)) +
geom_histogram(width = 0.004,
outlier.alpha = 0, fill = "darkorange", alpha = 0.5) +
labs(x = "Process fluency", y = "Frequencies") +
theme_bw() -> a
ggplot(df1, aes(x = pf, fill=condition)) +
geom_histogram(width = 0.004, position = position_dodge(),
outlier.alpha = 0, alpha = 0.5) +
labs(x = "Process fluency", y = "Frequencies") +
theme_bw() -> b
a+b6.1.2 Process fluency hypothesis (Inferential)
We explore if there are any differences between the two groups. We use a Bayes Factor to assess the difference between the two groups.
# Bayes Factor
1/ttestBF(df1$pf[df1$condition==2], df1$pf[df1$condition==1],
nullInterval = c(0, Inf)) denominator
numerator Alt., r=0.707 0<d<Inf Alt., r=0.707 !(0<d<Inf)
Null, mu1-mu2=0 5.137424 8.649357
#Bayes factor on the log
1/ttestBF(log(df1$pf[df1$condition==2]), log(df1$pf[df1$condition==1]),
nullInterval = c(0, Inf)) denominator
numerator Alt., r=0.707 0<d<Inf Alt., r=0.707 !(0<d<Inf)
Null, mu1-mu2=0 3.126341 11.63849
#Bayes Factor with ordinal regression
library(brms)
# Treat process fluency as an ordered factor so it can be modelled with a
# cumulative-logit (ordinal) regression with freely estimated thresholds
df1$pfo <- factor(df1$pf, ordered = TRUE)
# H1: condition predicts process fluency.
# NOTE(review): these assignments overwrite the GLM objects m0/m1 fitted in
# the AoA section above — rename if those models are still needed later.
m1 <- brm(pfo~condition, data=df1, family = cumulative(link = "logit",
threshold = "flexible"),
silent =2)
# H0: intercept-only model, used as the bridge-sampling denominator below
m0 <- brm(pfo~1, data=df1, family = cumulative(link = "logit",
threshold = "flexible"),
silent=2)
bayes_factor(
bridge_sampler(m1),
bridge_sampler(m0)
) -> BFBFEstimated Bayes factor in favor of x1 over x2: 0.98842
We assess the correlation with the main dependent variable in the FL group.
cor(df1$causalita[df1$condition=="2"], df1$pf[df1$condition=="2"])[1] 0.05806017
cor(df1$causalita[df1$condition=="2"], df1$pf[df1$condition=="2"],
method="kendall")[1] -0.01112788
6.2 Emotional disengagement hypothesis
6.2.1 Emotional disengagement hypothesis (descriptive)
We provide summary statistics and a visual representation of emotional intensity scale. We visualize descriptive statistics illustrating the difference between the two groups.
# Summary statistics (univariate)
library(pastecs)
summary(df1$intensita) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.5057 0.6564 0.6146 0.7454 1.0000
round(stat.desc(df1$intensita, norm = TRUE), 2) nbr.val nbr.null nbr.na min max range
220.00 3.00 0.00 0.00 1.00 1.00
sum median mean SE.mean CI.mean.0.95 var
135.20 0.66 0.61 0.01 0.03 0.04
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
0.21 0.34 -0.76 -2.33 0.39 0.59
normtest.W normtest.p
0.95 0.00
# Summary statistics (with condition i)
aggregate(df1$intensita, list(df1$condition),
function(x) round(stat.desc(x),2)) Group.1 x.nbr.val x.nbr.null x.nbr.na x.min x.max x.range x.sum x.median
1 1 110.00 2.00 0.00 0.00 1.00 1.00 66.35 0.64
2 2 110.00 1.00 0.00 0.00 1.00 1.00 68.85 0.67
x.mean x.SE.mean x.CI.mean.0.95 x.var x.std.dev x.coef.var
1 0.60 0.02 0.04 0.04 0.20 0.34
2 0.63 0.02 0.04 0.05 0.21 0.34
# Summary statistics (with condition ii)
aggregate(df1$intensita, list(df1$condition),
function(x) round(summary(x),2)) Group.1 x.Min. x.1st Qu. x.Median x.Mean x.3rd Qu. x.Max.
1 1 0.00 0.50 0.64 0.60 0.74 1.00
2 2 0.00 0.55 0.67 0.63 0.75 1.00
# Effect sizes (Cliff's Delta)
cliffs_delta(df1$intensita[df1$condition==2],
df1$intensita[df1$condition==1], pooled_sd = T)[[1]]
[1] 0.08454545
# Effect sizes (Cohen's d)
cohens_d(df1$intensita[df1$condition==2],
df1$intensita[df1$condition==1], pooled_sd = T)[[1]]
[1] 0.1090735
# Plot
# NOTE(review): `width` and `outlier.alpha` are not geom_density() arguments
# (they belong to geom_boxplot()) and are ignored by ggplot2 with a warning —
# likely left over from an earlier boxplot version of this figure; also, the
# y-axis of a density plot shows density, not frequencies.
ggplot(df1, aes(x = intensita)) +
geom_density(width = 0.004,
outlier.alpha = 0, fill = "darkorange", alpha = 0.5) +
labs(x = "Emotional engagement", y = "Frequencies") +
theme_bw() -> a
ggplot(df1, aes(x = intensita, fill=condition)) +
geom_density(width = 0.004, position = position_dodge(),
outlier.alpha = 0, alpha = 0.5) +
labs(x = "Emotional engagement", y = "Frequencies") +
theme_bw() -> b
a+b
6.2.2 Emotional disengagement hypothesis (Inferential)
We explore if there are any differences between the two groups. We use a Bayes Factor to assess the difference between the two groups.
# Bayes Factor
1/ttestBF(df1$intensita[df1$condition==1], df1$intensita[df1$condition==2],
nullInterval = c(0, Inf))
denominator
numerator Alt., r=0.707 0<d<Inf Alt., r=0.707 !(0<d<Inf)
Null, mu1-mu2=0 11.52754 3.188544
We assess the correlation with the main dependent variable in the FL group.
cor(df1$causalita[df1$condition=="2"], df1$intensita[df1$condition=="2"])
[1] -0.06305575
6.3 Prior knowledge hypothesis
6.3.1 Prior knowledge hypothesis (descriptive)
We provide summary statistics and a visual representation of prior knowledge scale. We visualize descriptive statistics illustrating the difference between the two groups.
# Summary statistics (univariate)
library(pastecs)
summary(df1$esperienza) Min. 1st Qu. Median Mean 3rd Qu. Max.
10.00 72.00 81.50 78.56 89.00 100.00
round(stat.desc(df1$esperienza, norm = TRUE), 2) nbr.val nbr.null nbr.na min max range
220.00 0.00 0.00 10.00 100.00 90.00
sum median mean SE.mean CI.mean.0.95 var
17283.00 81.50 78.56 1.00 1.98 221.65
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
14.89 0.19 -1.55 -4.73 3.35 5.14
normtest.W normtest.p
0.88 0.00
# Summary statistics (with condition i)
aggregate(df1$esperienza, list(df1$condition),
function(x) round(stat.desc(x),2)) Group.1 x.nbr.val x.nbr.null x.nbr.na x.min x.max x.range x.sum
1 1 110.00 0.00 0.00 21.00 100.00 79.00 8696.00
2 2 110.00 0.00 0.00 10.00 100.00 90.00 8587.00
x.median x.mean x.SE.mean x.CI.mean.0.95 x.var x.std.dev x.coef.var
1 81.50 79.05 1.34 2.67 198.99 14.11 0.18
2 81.50 78.06 1.50 2.96 245.86 15.68 0.20
# Summary statistics (with condition ii)
aggregate(df1$esperienza, list(df1$condition),
function(x) round(summary(x),2)) Group.1 x.Min. x.1st Qu. x.Median x.Mean x.3rd Qu. x.Max.
1 1 21.00 75.00 81.50 79.05 88.00 100.00
2 2 10.00 70.00 81.50 78.06 90.00 100.00
# Effect sizes (Cliff's Delta)
cliffs_delta(df1$esperienza[df1$condition==2],
df1$esperienza[df1$condition==1], pooled_sd = T)[[1]]
[1] -0.01545455
# Effect sizes (Cohen's d)
cohens_d(df1$esperienza[df1$condition==2],
df1$esperienza[df1$condition==1], pooled_sd = T)[[1]]
[1] -0.06644219
# Plot
ggplot(df1, aes(x = esperienza)) +
geom_density(width = 0.004,
outlier.alpha = 0, fill = "darkorange", alpha = 0.5) +
labs(x = "Prior knowledge", y = "Frequencies") +
theme_bw() -> a
ggplot(df1, aes(x = esperienza, fill=condition)) +
geom_density(width = 0.004, position = position_dodge(),
outlier.alpha = 0, alpha = 0.5) +
labs(x = "Prior knowledge", y = "Frequencies") +
theme_bw() -> b
a+b
6.3.2 Prior knowledge hypothesis (Inferential)
We explore if there are any differences between the two groups. We use a Bayes Factor to assess the difference between the two groups.
# Bayes Factor
1/ttestBF(df1$esperienza[df1$condition==1], df1$esperienza[df1$condition==2],
nullInterval = c(0, Inf))
denominator
numerator Alt., r=0.707 0<d<Inf Alt., r=0.707 !(0<d<Inf)
Null, mu1-mu2=0 4.434955 9.57611
We assess the correlation with the main dependent variable in the FL group.
cor(df1$causalita[df1$condition=="2"], df1$esperienza[df1$condition=="2"])
[1] 0.04750349
6.4 Memory hypothesis
6.4.1 Memory hypothesis (descriptive)
We provide summary statistics and a visual representation of the numerical evaluation of the “a” trial. We visualize descriptive statistics illustrating the difference between the two groups.
# Summary statistics (univariate)
library(pastecs)
summary(df1$trial_ss_e) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 12.00 17.00 21.06 20.00 90.00
round(stat.desc(df1$trial_ss_e, norm = TRUE), 2) nbr.val nbr.null nbr.na min max range
220.00 1.00 0.00 0.00 90.00 90.00
sum median mean SE.mean CI.mean.0.95 var
4634.00 17.00 21.06 1.01 1.99 224.75
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
14.99 0.71 2.20 6.70 5.14 7.86
normtest.W normtest.p
0.75 0.00
# Summary statistics (with condition i)
aggregate(df1$trial_ss_e, list(df1$condition),
function(x) round(stat.desc(x),2)) Group.1 x.nbr.val x.nbr.null x.nbr.na x.min x.max x.range x.sum
1 1 110.00 0.00 0.00 4.00 80.00 76.00 2145.00
2 2 110.00 1.00 0.00 0.00 90.00 90.00 2489.00
x.median x.mean x.SE.mean x.CI.mean.0.95 x.var x.std.dev x.coef.var
1 17.00 19.50 1.21 2.39 159.94 12.65 0.65
2 17.00 22.63 1.61 3.20 286.69 16.93 0.75
# Summary statistics (with condition ii)
aggregate(df1$trial_ss_e, list(df1$condition),
function(x) round(summary(x),2)) Group.1 x.Min. x.1st Qu. x.Median x.Mean x.3rd Qu. x.Max.
1 1 4.00 12.00 17.00 19.50 20.00 80.00
2 2 0.00 12.00 17.00 22.63 22.75 90.00
# Effect sizes (Cliff's Delta)
cliffs_delta(df1$trial_ss_e[df1$condition==2],
df1$trial_ss_e[df1$condition==1], pooled_sd = T)[[1]]
[1] 0.04975207
# Effect sizes (Cohen's d)
cohens_d(df1$trial_ss_e[df1$condition==2],
df1$trial_ss_e[df1$condition==1], pooled_sd = T)[[1]]
[1] 0.2092687
# Plot
ggplot(df1, aes(x = trial_ss_e)) +
geom_density(width = 0.004,
outlier.alpha = 0, fill = "darkorange", alpha = 0.5) +
labs(x = "Trial a", y = "Frequencies") +
theme_bw() -> a
ggplot(df1, aes(x = trial_ss_e, fill=condition)) +
geom_density(width = 0.004, position = position_dodge(),
outlier.alpha = 0, alpha = 0.5) +
labs(x = "Trial a", y = "Frequencies") +
theme_bw() -> b
a+b
We provide summary statistics and a visual representation of the numerical evaluation of all trials. We visualize descriptive statistics illustrating the difference between the two groups.
# Summary statistics (univariate)
df1$tt <- df1$trial_nn_e + df1$trial_ns_e + df1$trial_sn_e + df1$trial_ss_e
summary(df1$tt) Min. 1st Qu. Median Mean 3rd Qu. Max.
17.00 40.00 47.00 57.61 60.25 270.00
round(stat.desc(df1$tt, norm = TRUE), 2) nbr.val nbr.null nbr.na min max range
220.00 0.00 0.00 17.00 270.00 253.00
sum median mean SE.mean CI.mean.0.95 var
12675.00 47.00 57.61 2.45 4.82 1316.31
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
36.28 0.63 3.09 9.43 11.49 17.59
normtest.W normtest.p
0.67 0.00
# Summary statistics (with condition i)
aggregate(df1$tt, list(df1$condition),
function(x) round(stat.desc(x),2)) Group.1 x.nbr.val x.nbr.null x.nbr.na x.min x.max x.range x.sum
1 1 110.00 0.00 0.00 17.00 240.00 223.00 5999.00
2 2 110.00 0.00 0.00 20.00 270.00 250.00 6676.00
x.median x.mean x.SE.mean x.CI.mean.0.95 x.var x.std.dev x.coef.var
1 46.00 54.54 2.98 5.90 974.86 31.22 0.57
2 47.00 60.69 3.87 7.68 1650.73 40.63 0.67
# Summary statistics (with condition ii)
aggregate(df1$tt, list(df1$condition),
function(x) round(summary(x),2)) Group.1 x.Min. x.1st Qu. x.Median x.Mean x.3rd Qu. x.Max.
1 1 17.00 40.00 46.00 54.54 58.50 240.00
2 2 20.00 39.25 47.00 60.69 64.75 270.00
# Effect sizes (Cliff's Delta)
cliffs_delta(df1$tt[df1$condition==2],
df1$tt[df1$condition==1], pooled_sd = T)[[1]]
[1] 0.06743802
# Effect sizes (Cohen's d)
cohens_d(df1$tt[df1$condition==2],
df1$tt[df1$condition==1], pooled_sd = T)[[1]]
[1] 0.1698626
# Plot
ggplot(df1, aes(x = tt)) +
geom_density(width = 0.004,
outlier.alpha = 0, fill = "darkorange", alpha = 0.5) +
labs(x = "Trials", y = "Frequencies") +
theme_bw() -> a
ggplot(df1, aes(x = tt, fill=condition)) +
geom_density(width = 0.004, position = position_dodge(),
outlier.alpha = 0, alpha = 0.5) +
labs(x = "Trials", y = "Frequencies") +
theme_bw() -> b
a+b
6.4.2 Memory hypothesis (Inferential)
We explore if there are any differences between the two groups. We use a Bayes Factor to assess the difference between the two groups.
# Bayes Factor
1/ttestBF(df1$trial_ss_e[df1$condition==1], df1$trial_ss_e[df1$condition==2],
nullInterval = c(0,Inf))
denominator
numerator Alt., r=0.707 0<d<Inf Alt., r=0.707 !(0<d<Inf)
Null, mu1-mu2=0 16.46285 1.175919
#Bayes Factor Total trials
1/ttestBF(df1$tt[df1$condition==1], df1$tt[df1$condition==2],
nullInterval = c(0,Inf))
denominator
numerator Alt., r=0.707 0<d<Inf Alt., r=0.707 !(0<d<Inf)
Null, mu1-mu2=0 14.47553 1.814336
# Bayes Factor in log
df1$trial_ss_eL <- log(df1$trial_ss_e)
# NOTE(review): the test below is run on the untransformed trial_ss_e rather
# than the log-transformed trial_ss_eL created above (the output is identical
# to the untransformed analysis); note also that log() of the zero trial
# values present in the data yields -Inf.
1/ttestBF(df1$trial_ss_e[df1$condition==1], df1$trial_ss_e[df1$condition==2],
nullInterval = c(0,Inf))
denominator
numerator Alt., r=0.707 0<d<Inf Alt., r=0.707 !(0<d<Inf)
Null, mu1-mu2=0 16.46285 1.175919
# Bayes Factor Total trials in log
df1$ttL <- log(df1$tt)
# NOTE(review): as above, the test below uses the untransformed tt rather than
# the log-transformed ttL created above; the output matches the untransformed
# analysis.
1/ttestBF(df1$tt[df1$condition==1], df1$tt[df1$condition==2],
nullInterval = c(0,Inf))
denominator
numerator Alt., r=0.707 0<d<Inf Alt., r=0.707 !(0<d<Inf)
Null, mu1-mu2=0 14.47553 1.814336
# When using this, we identify a difference, but in the opposite direction
m1 <- brm(trial_ss_e+1~condition, data=df1, family = Gamma(link="identity"),
silent=2)
m0 <- brm(trial_ss_e+1~1, data=df1, family = Gamma(link="identity"),
silent = 2)
bayes_factor(
bridge_sampler(m1),
bridge_sampler(m0)
) -> BFss
#Bayes Factor Total trials
1/ttestBF(df1$tt[df1$condition==1], df1$tt[df1$condition==2])
m1x <- brm(tt~condition, data=df1, family = Gamma(link="identity"),silent = 2)
m0 <- brm(tt~1, data=df1, family = Gamma(link="identity"),silent = 2)
bayes_factor(
bridge_sampler(m1x),
bridge_sampler(m0)
) -> BF_tt
BF_tt; BFss
Estimated Bayes factor in favor of x1 over x2: 36.19786
Estimated Bayes factor in favor of x1 over x2: 23.53821
bayestestR::pd(m1)
Probability of Direction
Parameter | pd
--------------------
(Intercept) | 100%
condition2 | 96.23%
bayestestR::pd(m1x)
Probability of Direction
Parameter | pd
--------------------
(Intercept) | 100%
condition2 | 94.45%
We assess the correlation with the main dependent variable in the FL group.
# Type Pearson
cor(df1$causalita[df1$condition=="2"], df1$tt[df1$condition=="2"])
[1] 0.2163476
cor(df1$causalita[df1$condition=="2"], df1$trial_ss_e[df1$condition=="2"])
[1] 0.3593487
cor(df1$causalita[df1$condition=="2"], df1$trial_sn_e[df1$condition=="2"])
[1] 0.1015263
cor(df1$causalita[df1$condition=="2"], df1$trial_ns_e[df1$condition=="2"])
[1] 0.03147821
cor(df1$causalita[df1$condition=="2"], df1$trial_nn_e[df1$condition=="2"])
[1] 0.1159261
# Type Kendall method = "kendall"
cor(df1$causalita[df1$condition=="2"], df1$tt[df1$condition=="2"],
method = "kendall")
[1] 0.1421889
cor(df1$causalita[df1$condition=="2"], df1$trial_ss_e[df1$condition=="2"],
method = "kendall")
[1] 0.3180543
cor(df1$causalita[df1$condition=="2"], df1$trial_sn_e[df1$condition=="2"],
method = "kendall")
[1] -0.06591999
cor(df1$causalita[df1$condition=="2"], df1$trial_ns_e[df1$condition=="2"],
method = "kendall")
[1] -0.04646215
cor(df1$causalita[df1$condition=="2"], df1$trial_nn_e[df1$condition=="2"],
method = "kendall")
[1] 0.04731497
6.5 Combining the different hypotheses in a BF linear model for the FL group (EXTRA)
sum(as.vector(generalTestBF(formula = causalita ~ pf *
esperienza * intensita *
trial_nn_e, data = df1[df1$condition==2,])) > 1)
[1] 0
6.6 Objective disfluency (EXTRA)
df1$t_causalita + df1$t_istruzioni + df1$t_sam +
df1$t_trial + df1$t_esperienza + df1$t_task + df1$t_pf -> df1$t_obj
by(df1$t_obj, df1$condition, DESC)
df1$condition: 1
nbr.val nbr.null nbr.na min max range
110.00 0.00 0.00 257.80 1852.52 1594.71
sum median mean SE.mean CI.mean.0.95 var
52372.29 416.20 476.11 19.90 39.44 43547.92
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
208.68 0.44 3.48 7.54 16.96 18.55
normtest.W normtest.p
0.67 0.00
------------------------------------------------------------
df1$condition: 2
nbr.val nbr.null nbr.na min max range
110.00 0.00 0.00 271.41 1196.70 925.29
sum median mean SE.mean CI.mean.0.95 var
55315.75 440.17 502.87 18.12 35.92 36131.60
std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
190.08 0.38 1.53 3.32 2.35 2.57
normtest.W normtest.p
0.85 0.00
hist(df1$t_obj)
1/ttestBF(df1$t_obj[df1$condition=="2"], df1$t_obj[df1$condition=="1"],
nullInterval = c(0,Inf))
denominator
numerator Alt., r=0.707 0<d<Inf Alt., r=0.707 !(0<d<Inf)
Null, mu1-mu2=0 2.565379 12.71902
summary(glm(t_obj~condition, df1, family=Gamma(link="identity")))
Call:
glm(formula = t_obj ~ condition, family = Gamma(link = "identity"),
data = df1)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 476.11 18.58 25.63 <2e-16 ***
condition2 26.76 27.02 0.99 0.323
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for Gamma family taken to be 0.1674955)
Null deviance: 26.320 on 219 degrees of freedom
Residual deviance: 26.155 on 218 degrees of freedom
AIC: 2864.9
Number of Fisher Scoring iterations: 3
df1$t_obj <- log(df1$t_obj)
1/ttestBF(df1$t_obj[df1$condition=="2"], df1$t_obj[df1$condition=="1"],
nullInterval = c(0,Inf))
denominator
numerator Alt., r=0.707 0<d<Inf Alt., r=0.707 !(0<d<Inf)
Null, mu1-mu2=0 1.753833 14.63894